mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
Actor: Replace Docling CLI with docling-serve API
This commit transitions the Actor from using the full Docling CLI package to the more lightweight docling-serve API. Key changes include: - Redesign Dockerfile to use docling-serve as base image - Update actor.sh to communicate with API instead of running CLI commands - Improve content type handling for various output formats - Update input schema to align with API parameters - Reduce Docker image size from ~6GB to ~600MB - Update documentation and changelog to reflect architectural changes The image size reduction will make the Actor more cost-effective for users while maintaining all existing functionality including OCR capabilities. Issue: No official docling-serve Docker image is currently available, which will be addressed in a future commit. Signed-off-by: Václav Vančura <commit@vancura.dev>
This commit is contained in:
parent
11f2960907
commit
9f86971fad
@ -5,6 +5,24 @@ All notable changes to the Docling Actor will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [1.1.0] - 2025-03-15
|
||||
|
||||
### Changed
|
||||
|
||||
- Switched from full Docling CLI to docling-serve API
|
||||
- Dramatically reduced Docker image size (from ~6GB to ~600MB)
|
||||
- Improved API compatibility with docling-serve
|
||||
- Better content type handling for different output formats
|
||||
- Updated error handling to align with API responses
|
||||
|
||||
### Technical Details
|
||||
|
||||
- Actor Specification v1
|
||||
- Using ds4sd/docling-serve:latest base image
|
||||
- Node.js 20.x for Apify CLI
|
||||
- Eliminated Python dependencies
|
||||
- Simplified Docker build process
|
||||
|
||||
## [1.0.0] - 2025-02-07
|
||||
|
||||
### Added
|
||||
|
@ -1,57 +1,36 @@
|
||||
ARG ACTOR_PATH_IN_DOCKER_CONTEXT
|
||||
|
||||
FROM python:3.11-slim-bookworm
|
||||
FROM ds4sd/docling-serve:latest
|
||||
|
||||
LABEL maintainer="Vaclav Vancura <@vancura>"
|
||||
LABEL description="Apify Actor for document processing using Docling"
|
||||
LABEL version="1.0.0"
|
||||
LABEL version="1.1.0"
|
||||
|
||||
RUN groupadd -r appuser && useradd -r -g appuser -s /sbin/nologin appuser && \
|
||||
\
|
||||
apt-get update && apt-get install -y --no-install-recommends bash curl file git gpg jo jq procps xz-utils && \
|
||||
\
|
||||
mkdir -p /etc/apt/keyrings && \
|
||||
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \
|
||||
echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list && \
|
||||
\
|
||||
apt-get update && apt-get install -y nodejs && apt-get clean && \
|
||||
\
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
\
|
||||
pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir docling==2.17.0 && \
|
||||
\
|
||||
npm install -g npm@latest && \
|
||||
npm install -g apify-cli && \
|
||||
npm cache clean --force && \
|
||||
\
|
||||
mkdir -p /home/appuser && \
|
||||
chown -R appuser:appuser /home/appuser && \
|
||||
\
|
||||
apt-get update && apt-get install -y --no-install-recommends \
|
||||
time \
|
||||
# Install necessary dependencies for the Apify Actor
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
curl \
|
||||
gpg \
|
||||
jq \
|
||||
xz-utils \
|
||||
jo \
|
||||
procps \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Node.js for Apify CLI
|
||||
RUN mkdir -p /etc/apt/keyrings && \
|
||||
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \
|
||||
echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list && \
|
||||
apt-get update && apt-get install -y nodejs && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
npm install -g npm@latest && \
|
||||
npm install -g apify-cli && \
|
||||
npm cache clean --force
|
||||
|
||||
# Create directories and set permissions
|
||||
WORKDIR /app
|
||||
|
||||
RUN mkdir -p /tmp/runtime-root && \
|
||||
chmod 0700 /tmp/runtime-root && \
|
||||
chown -R appuser:appuser /tmp/runtime-root /app
|
||||
# Copy actor files
|
||||
COPY --chown=1000:1000 .actor/ .actor/
|
||||
|
||||
COPY --chown=appuser:appuser .actor/ .actor/
|
||||
COPY --chown=appuser:appuser . .
|
||||
|
||||
USER appuser
|
||||
|
||||
# Set environment variables for better resource management
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
ENV MALLOC_ARENA_MAX=2
|
||||
ENV EASYOCR_DOWNLOAD_CACHE="/tmp/easyocr"
|
||||
|
||||
# Create cache directory for EasyOCR
|
||||
RUN mkdir -p /tmp/easyocr && \
|
||||
chown -R appuser:appuser /tmp/easyocr
|
||||
# Security best practice: run as non-root user (docling-serve already uses a non-root user)
|
||||
USER 1000
|
||||
|
||||
ENTRYPOINT [".actor/actor.sh"]
|
||||
|
@ -17,14 +17,14 @@ This Actor (specification v1) wraps the [Docling project](https://ds4sd.github.i
|
||||
5. [Performance & Resources](#performance--resources)
|
||||
6. [Troubleshooting](#troubleshooting)
|
||||
7. [Local Development](#local-development)
|
||||
8. [Requirements & Installation](#requirements--installation)
|
||||
8. [Architecture](#architecture)
|
||||
9. [License](#license)
|
||||
10. [Acknowledgments](#acknowledgments)
|
||||
11. [Security Considerations](#security-considerations)
|
||||
|
||||
## Features
|
||||
|
||||
- Runs Docling v2.17.0 in a fully managed environment on Apify
|
||||
- Leverages the lightweight docling-serve API for efficient document processing
|
||||
- Processes multiple document formats:
|
||||
- PDF documents (scanned or digital)
|
||||
- Microsoft Office files (DOCX, XLSX, PPTX)
|
||||
@ -169,11 +169,10 @@ Content of section 2...
|
||||
|
||||
The Actor maintains detailed processing logs including:
|
||||
|
||||
- Memory usage statistics
|
||||
- API request and response details
|
||||
- Processing steps and timing
|
||||
- Error messages and stack traces
|
||||
- Input validation results
|
||||
- OCR processing details (when enabled)
|
||||
|
||||
Access logs via:
|
||||
|
||||
@ -183,18 +182,14 @@ apify key-value-stores get-record DOCLING_LOG
|
||||
|
||||
## Performance & Resources
|
||||
|
||||
- **Docker Image Size**: ~6 GB (includes OCR libraries and ML models)
|
||||
- **Docker Image Size**: ~600 MB
|
||||
- **Memory Requirements**:
|
||||
- Minimum: 4 GB RAM
|
||||
- Recommended: 8 GB RAM for large documents
|
||||
- **Memory Monitoring**:
|
||||
- Real-time memory usage tracking during processing
|
||||
- Detailed memory statistics in `DOCLING_LOG`
|
||||
- Automatic failure detection for out-of-memory situations
|
||||
- Minimum: 2 GB RAM
|
||||
- Recommended: 4 GB RAM for large or complex documents
|
||||
- **Processing Time**:
|
||||
- Simple documents: 30-60 seconds
|
||||
- Complex PDFs with OCR: 2-5 minutes
|
||||
- Large documents (100+ pages): 5-15 minutes
|
||||
- Simple documents: 15-30 seconds
|
||||
- Complex PDFs with OCR: 1-3 minutes
|
||||
- Large documents (100+ pages): 3-10 minutes
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
@ -210,10 +205,10 @@ Common issues and solutions:
|
||||
- Check if the image quality is sufficient
|
||||
- Try processing with OCR disabled
|
||||
|
||||
3. **Memory Issues**
|
||||
- For large documents, try splitting them into smaller chunks
|
||||
- Consider using a higher-memory compute unit
|
||||
- Disable OCR if not strictly necessary
|
||||
3. **API Response Issues**
|
||||
- Check the logs for detailed error messages
|
||||
- Ensure the document format is supported
|
||||
- Verify the URL is correctly formatted
|
||||
|
||||
4. **Output Format Issues**
|
||||
- Verify the output format is supported
|
||||
@ -227,7 +222,6 @@ The Actor implements comprehensive error handling:
|
||||
- Input validation for document URLs and parameters
|
||||
- Detailed error messages in `DOCLING_LOG`
|
||||
- Proper exit codes for different failure scenarios
|
||||
- Memory monitoring and out-of-memory detection
|
||||
- Automatic cleanup on failure
|
||||
- Dataset records with processing status
|
||||
|
||||
@ -242,7 +236,6 @@ If you wish to develop or modify this Actor locally:
|
||||
- `actor.json` - Actor configuration and metadata
|
||||
- `actor.sh` - Main execution script
|
||||
- `input_schema.json` - Input parameter definitions
|
||||
- `.dockerignore` - Build optimization rules
|
||||
4. Run the Actor locally using:
|
||||
|
||||
```bash
|
||||
@ -257,30 +250,24 @@ If you wish to develop or modify this Actor locally:
|
||||
├── actor.json # Actor metadata
|
||||
├── actor.sh # Execution script
|
||||
├── input_schema.json # Input parameters
|
||||
├── .dockerignore # Build exclusions
|
||||
└── README.md # This documentation
|
||||
```
|
||||
|
||||
## Requirements & Installation
|
||||
## Architecture
|
||||
|
||||
- An [Apify account](https://console.apify.com/?fpr=docling) (free tier available)
|
||||
- For local development:
|
||||
- Docker installed
|
||||
- Apify CLI (`npm install -g apify-cli`)
|
||||
- Git for version control
|
||||
- The Actor's Docker image (~6 GB) includes:
|
||||
- Python 3.11 with optimized caching (.pyc, .pyo excluded)
|
||||
- Node.js 20.x
|
||||
- Docling v2.17.0 and its dependencies
|
||||
- OCR libraries and ML models
|
||||
This Actor uses a lightweight architecture based on the official `ds4sd/docling-serve` Docker image:
|
||||
|
||||
### Build Optimizations
|
||||
|
||||
The Actor uses several optimizations to maintain efficiency:
|
||||
|
||||
- Python cache files (`pycache`, `.pyc`, `.pyo`, `.pyd`) are excluded
|
||||
- Development artifacts (`.git`, `.env`, `.venv`) are ignored
|
||||
- Log and test files (`*.log`, `.pytest_cache`, `.coverage`) are excluded from builds
|
||||
- **Base Image**: `ds4sd/docling-serve:latest` (~600MB)
|
||||
- **API Communication**: Uses the RESTful API provided by docling-serve on port 8080
|
||||
- **Request Flow**:
|
||||
1. Actor receives the input parameters
|
||||
2. Creates a JSON payload for the docling-serve API
|
||||
3. Makes a POST request to the /convert endpoint
|
||||
4. Processes the response and stores it in the key-value store
|
||||
- **Dependencies**:
|
||||
- Node.js for Apify CLI
|
||||
- Essential Linux tools (curl, jq, etc.)
|
||||
- **Security**: Runs as a non-root user for enhanced security
|
||||
|
||||
## License
|
||||
|
||||
@ -288,12 +275,12 @@ This wrapper project is under the MIT License, matching the original Docling lic
|
||||
|
||||
## Acknowledgments
|
||||
|
||||
- [Docling](https://ds4sd.github.io/docling/) codebase by IBM
|
||||
- [Docling](https://ds4sd.github.io/docling/) and [docling-serve](https://github.com/DS4SD/docling-serve) by IBM
|
||||
- [Apify](https://apify.com/?fpr=docling) for the serverless actor environment
|
||||
|
||||
## Security Considerations
|
||||
|
||||
- Actor runs under a non-root user (appuser) for enhanced security
|
||||
- Actor runs under a non-root user for enhanced security
|
||||
- Input URLs are validated before processing
|
||||
- Temporary files are securely managed and cleaned up
|
||||
- Process isolation through Docker containerization
|
||||
|
164
.actor/actor.sh
164
.actor/actor.sh
@ -17,13 +17,13 @@ exec 2> >(tee -a "$LOG_FILE" >&2)
|
||||
trap 'echo "Error on line $LINENO"' ERR
|
||||
set -e
|
||||
|
||||
# --- Validate Docling installation ---
|
||||
# --- Define error codes ---
|
||||
|
||||
# Check if Docling CLI is installed and in PATH.
|
||||
if ! command -v docling &>/dev/null; then
|
||||
echo "Error: Docling CLI is not installed or not in PATH"
|
||||
exit 1
|
||||
fi
|
||||
readonly ERR_INVALID_INPUT=10
|
||||
readonly ERR_URL_INACCESSIBLE=11
|
||||
readonly ERR_DOCLING_FAILED=12
|
||||
readonly ERR_OUTPUT_MISSING=13
|
||||
readonly ERR_STORAGE_FAILED=14
|
||||
|
||||
# --- Input parsing ---
|
||||
|
||||
@ -36,24 +36,10 @@ INPUT="$(apify actor:get-input || {
|
||||
|
||||
DOCUMENT_URL="$(echo "${INPUT}" | jq -r '.documentUrl')"
|
||||
OUTPUT_FORMAT="$(echo "${INPUT}" | jq -r '.outputFormat')"
|
||||
OUTPUT_NAME="output_file.${OUTPUT_FORMAT}"
|
||||
|
||||
# Define error codes.
|
||||
readonly ERR_INVALID_INPUT=10
|
||||
readonly ERR_URL_INACCESSIBLE=11
|
||||
readonly ERR_DOCLING_FAILED=12
|
||||
readonly ERR_OUTPUT_MISSING=13
|
||||
readonly ERR_STORAGE_FAILED=14
|
||||
|
||||
# Update error handling with codes.
|
||||
if [ -z "$DOCUMENT_URL" ]; then
|
||||
echo "Error: Missing document URL. Please provide 'documentUrl' in the input"
|
||||
apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"status\": \"error\", \"error\": \"Missing document URL\"}" || true
|
||||
exit $ERR_INVALID_INPUT
|
||||
fi
|
||||
OCR_ENABLED="$(echo "${INPUT}" | jq -r '.ocr')"
|
||||
|
||||
# If no output format is specified, default to 'md'.
|
||||
if [ -z "$OUTPUT_FORMAT" ]; then
|
||||
if [ -z "$OUTPUT_FORMAT" ] || [ "$OUTPUT_FORMAT" = "null" ]; then
|
||||
OUTPUT_FORMAT="md"
|
||||
echo "No output format specified. Defaulting to 'md'"
|
||||
fi
|
||||
@ -62,10 +48,20 @@ fi
|
||||
case "$OUTPUT_FORMAT" in md | json | html | text | doctags) ;;
|
||||
*)
|
||||
echo "Error: Invalid output format '$OUTPUT_FORMAT'. Supported formats are 'md', 'json', 'html', 'text', and 'doctags'"
|
||||
exit 1
|
||||
apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"status\": \"error\", \"error\": \"Invalid output format\"}" || true
|
||||
exit $ERR_INVALID_INPUT
|
||||
;;
|
||||
esac
|
||||
|
||||
# Set output filename based on format.
|
||||
OUTPUT_NAME="output_file.${OUTPUT_FORMAT}"
|
||||
|
||||
if [ -z "$DOCUMENT_URL" ] || [ "$DOCUMENT_URL" = "null" ]; then
|
||||
echo "Error: Missing document URL. Please provide 'documentUrl' in the input"
|
||||
apify actor:push-data "{\"status\": \"error\", \"error\": \"Missing document URL\"}" || true
|
||||
exit $ERR_INVALID_INPUT
|
||||
fi
|
||||
|
||||
# Validate URL is accessible.
|
||||
echo "Validating document URL..."
|
||||
if ! curl --output /dev/null --silent --head --fail "${DOCUMENT_URL}"; then
|
||||
@ -75,65 +71,81 @@ if ! curl --output /dev/null --silent --head --fail "${DOCUMENT_URL}"; then
|
||||
exit $ERR_URL_INACCESSIBLE
|
||||
fi
|
||||
|
||||
# --- Build Docling command ---
|
||||
# --- Create JSON payload for docling-serve API ---
|
||||
|
||||
DOC_CONVERT_CMD="docling --verbose '${DOCUMENT_URL}' --to '${OUTPUT_FORMAT}'"
|
||||
echo "Creating API request for docling-serve..."
|
||||
|
||||
# If OCR is enabled, add the OCR flag to the command.
|
||||
if [ "$(echo "${INPUT}" | jq -r '.ocr')" = "true" ]; then
|
||||
DOC_CONVERT_CMD="${DOC_CONVERT_CMD} --ocr"
|
||||
# Set OCR flag.
|
||||
if [ "$OCR_ENABLED" = "true" ]; then
|
||||
OCR_VALUE="true"
|
||||
else
|
||||
OCR_VALUE="false"
|
||||
fi
|
||||
|
||||
# Print the exact command that will be executed.
|
||||
echo "Debug: Command string: $DOC_CONVERT_CMD"
|
||||
echo "Debug: Full command: /usr/bin/time -v bash -c \"$DOC_CONVERT_CMD\""
|
||||
|
||||
# --- Process document with Docling ---
|
||||
|
||||
echo "Processing document with Docling CLI..."
|
||||
echo "Running: $DOC_CONVERT_CMD"
|
||||
|
||||
# Create a timestamp file to ensure the document is processed only once.
|
||||
TIMESTAMP_FILE="/tmp/docling.timestamp"
|
||||
touch "$TIMESTAMP_FILE" || {
|
||||
echo "Error: Failed to create timestamp file"
|
||||
exit 1
|
||||
# Create a temporary file for the JSON payload.
|
||||
REQUEST_FILE="/tmp/docling_request.json"
|
||||
cat > "$REQUEST_FILE" << EOF
|
||||
{
|
||||
"document_url": "${DOCUMENT_URL}",
|
||||
"output_format": "${OUTPUT_FORMAT}",
|
||||
"ocr": ${OCR_VALUE}
|
||||
}
|
||||
EOF
|
||||
|
||||
echo "Starting document processing with memory monitoring..."
|
||||
/usr/bin/time -v bash -c "${DOC_CONVERT_CMD}" 2>&1 | tee -a "$LOG_FILE"
|
||||
DOCLING_EXIT_CODE=${PIPESTATUS[0]}
|
||||
echo "Request payload:"
|
||||
cat "$REQUEST_FILE"
|
||||
|
||||
# Check if the command failed and handle the error.
|
||||
if [ $DOCLING_EXIT_CODE -ne 0 ]; then
|
||||
echo "Error: Docling command failed with exit code $DOCLING_EXIT_CODE"
|
||||
echo "Memory usage information:"
|
||||
free -h
|
||||
df -h
|
||||
# --- Call docling-serve API ---
|
||||
|
||||
echo "Calling docling-serve API (localhost:8080/convert)..."
|
||||
|
||||
RESPONSE_FILE="/tmp/docling_response.json"
|
||||
HTTP_CODE=$(curl -s -o "$RESPONSE_FILE" -w "%{http_code}" -X POST \
|
||||
-H "Content-Type: application/json" \
|
||||
-d @"$REQUEST_FILE" \
|
||||
http://localhost:8080/convert)
|
||||
|
||||
echo "API Response Status Code: $HTTP_CODE"
|
||||
|
||||
# Check response status code.
|
||||
if [ "$HTTP_CODE" -ne 200 ]; then
|
||||
echo "Error: docling-serve API returned error code $HTTP_CODE"
|
||||
if [ -f "$RESPONSE_FILE" ]; then
|
||||
echo "Error response:"
|
||||
cat "$RESPONSE_FILE"
|
||||
fi
|
||||
|
||||
ERROR_MSG=$(jq -r '.error // "Unknown API error"' "$RESPONSE_FILE" 2>/dev/null || echo "Unknown API error")
|
||||
|
||||
apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"status\": \"error\", \"error\": \"${ERROR_MSG}\"}" || true
|
||||
exit $ERR_DOCLING_FAILED
|
||||
fi
|
||||
|
||||
GENERATED_FILE="$(find . -type f -name "*.${OUTPUT_FORMAT}" -newer "$TIMESTAMP_FILE")"
|
||||
# --- Process API response ---
|
||||
|
||||
# If no generated file is found, exit with an error.
|
||||
if [ -z "$GENERATED_FILE" ]; then
|
||||
echo "Error: Could not find generated output file with extension .$OUTPUT_FORMAT"
|
||||
echo "Processing API response..."
|
||||
|
||||
# Extract content from response and save to output file.
|
||||
if ! jq -r '.content' "$RESPONSE_FILE" > "$OUTPUT_NAME" 2>/dev/null; then
|
||||
echo "Error: Failed to parse API response or extract content"
|
||||
echo "Response content:"
|
||||
cat "$RESPONSE_FILE"
|
||||
|
||||
apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"status\": \"error\", \"error\": \"Failed to parse API response\"}" || true
|
||||
exit $ERR_OUTPUT_MISSING
|
||||
fi
|
||||
|
||||
mv "${GENERATED_FILE}" "${OUTPUT_NAME}"
|
||||
|
||||
# --- Validate output ---
|
||||
|
||||
# If the output file is not found, exit with an error.
|
||||
# Validate output file.
|
||||
if [ ! -f "$OUTPUT_NAME" ]; then
|
||||
echo "Error: Expected output file '$OUTPUT_NAME' was not generated"
|
||||
echo "Error: Output file was not created"
|
||||
apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"status\": \"error\", \"error\": \"Output file not created\"}" || true
|
||||
exit $ERR_OUTPUT_MISSING
|
||||
fi
|
||||
|
||||
# If the output file is empty, exit with an error.
|
||||
# Validate output file is not empty.
|
||||
if [ ! -s "$OUTPUT_NAME" ]; then
|
||||
echo "Error: Generated output file '$OUTPUT_NAME' is empty"
|
||||
echo "Error: Output file is empty"
|
||||
apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"status\": \"error\", \"error\": \"Output file is empty\"}" || true
|
||||
exit $ERR_OUTPUT_MISSING
|
||||
fi
|
||||
|
||||
@ -142,7 +154,18 @@ echo "Document successfully processed and exported as '$OUTPUT_FORMAT' to file:
|
||||
# --- Store output and log in key-value store ---
|
||||
|
||||
echo "Pushing processed document to key-value store (record key: OUTPUT_RESULT)..."
|
||||
apify actor:set-value "OUTPUT_RESULT" --contentType "application/$OUTPUT_FORMAT" <"$OUTPUT_NAME" || {
|
||||
|
||||
CONTENT_TYPE=""
|
||||
case "$OUTPUT_FORMAT" in
|
||||
md) CONTENT_TYPE="text/markdown" ;;
|
||||
json) CONTENT_TYPE="application/json" ;;
|
||||
html) CONTENT_TYPE="text/html" ;;
|
||||
text) CONTENT_TYPE="text/plain" ;;
|
||||
doctags) CONTENT_TYPE="application/json" ;;
|
||||
*) CONTENT_TYPE="text/plain" ;;
|
||||
esac
|
||||
|
||||
apify actor:set-value "OUTPUT_RESULT" --contentType "$CONTENT_TYPE" < "$OUTPUT_NAME" || {
|
||||
echo "Error: Failed to push the output document to the key-value store"
|
||||
exit $ERR_STORAGE_FAILED
|
||||
}
|
||||
@ -154,24 +177,19 @@ apify actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"output_file\": \"${RESUL
|
||||
echo "Warning: Failed to push data to dataset"
|
||||
}
|
||||
|
||||
if [ -f "$LOG_FILE" ]; then
|
||||
if [ -s "$LOG_FILE" ]; then
|
||||
echo "Log file is not empty, pushing to key-value store (record key: DOCLING_LOG)..."
|
||||
# Store logs.
|
||||
if [ -f "$LOG_FILE" ] && [ -s "$LOG_FILE" ]; then
|
||||
echo "Pushing log file to key-value store (record key: DOCLING_LOG)..."
|
||||
apify actor:set-value "DOCLING_LOG" --contentType "text/plain" < "$LOG_FILE" || {
|
||||
echo "Warning: Failed to push the log file to the key-value store"
|
||||
}
|
||||
else
|
||||
echo "Warning: docling.log file exists but is empty"
|
||||
fi
|
||||
else
|
||||
echo "Warning: No docling.log file found"
|
||||
fi
|
||||
|
||||
# --- Cleanup temporary files ---
|
||||
|
||||
cleanup() {
|
||||
local exit_code=$?
|
||||
rm -f "$TIMESTAMP_FILE" || true
|
||||
rm -f "$REQUEST_FILE" "$RESPONSE_FILE" || true
|
||||
exit $exit_code
|
||||
}
|
||||
|
||||
|
@ -1,20 +1,20 @@
|
||||
{
|
||||
"title": "Docling Actor Input",
|
||||
"description": "Options for running Docling CLI on the Apify platform.",
|
||||
"description": "Options for processing documents with Docling via the docling-serve API.",
|
||||
"type": "object",
|
||||
"schemaVersion": 1,
|
||||
"properties": {
|
||||
"documentUrl": {
|
||||
"title": "Document URL",
|
||||
"type": "string",
|
||||
"description": "URL of the document to process with Docling. Supported formats: images, 'pdf', 'docx', 'pptx', 'xlsx, 'html', 'md', 'xml_pubmed', 'asciidoc', 'xml_uspto'.",
|
||||
"description": "URL of the document to process. Supported formats: PDF, DOCX, PPTX, XLSX, HTML, MD, XML, images, and more.",
|
||||
"prefill": "https://arxiv.org/pdf/2408.09869.pdf",
|
||||
"editor": "textfield"
|
||||
},
|
||||
"outputFormat": {
|
||||
"title": "Output Format",
|
||||
"type": "string",
|
||||
"description": "Specifies the desired output format after processing the document. Supported formats: 'md', 'json', 'html', 'text', 'doctags'.",
|
||||
"description": "Desired output format after processing the document.",
|
||||
"enum": ["md", "json", "html", "text", "doctags"],
|
||||
"default": "md",
|
||||
"editor": "select"
|
||||
@ -22,7 +22,7 @@
|
||||
"ocr": {
|
||||
"title": "Enable OCR",
|
||||
"type": "boolean",
|
||||
"description": "If true, OCR will be applied to scanned PDFs for text recognition.",
|
||||
"description": "If enabled, OCR will be applied to scanned documents for text recognition.",
|
||||
"default": true
|
||||
}
|
||||
},
|
||||
|
Loading…
Reference in New Issue
Block a user