mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 22:14:37 +00:00
Actor: Initial implementation
Signed-off-by: Václav Vančura <commit@vancura.dev> Signed-off-by: Adam Kliment <adam@netmilk.net>
This commit is contained in:
parent
235ae8765d
commit
4d13bb2650
5
.actor/.dockerignore
Normal file
5
.actor/.dockerignore
Normal file
@ -0,0 +1,5 @@
|
||||
# Paths excluded from the Docker build context.
.git
.gitignore
*.pyc
__pycache__
# Log files; the previous pattern `*.log.git` only matched names ending in
# ".log.git", so ordinary log files were still sent to the build.
*.log
|
22
.actor/Dockerfile
Normal file
22
.actor/Dockerfile
Normal file
@ -0,0 +1,22 @@
|
||||
# Image that runs the Docling CLI as an Apify Actor via .actor/actor.sh.
FROM python:3.11-slim

# Make RUN steps fail when any command of a pipeline fails; the NodeSource key
# download below is piped into gpg and a failed curl must not be masked.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# OS packages plus Node.js 20 from the NodeSource repository, in ONE layer:
# `apt-get update` is always paired with the install that uses it, and the
# package index is removed in the same layer that created it.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        bash \
        ca-certificates \
        curl \
        file \
        git \
        gpg \
        jo \
        jq \
        procps \
        xz-utils \
    && mkdir -p /etc/apt/keyrings \
    && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key \
        | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
    && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" \
        > /etc/apt/sources.list.d/nodesource.list \
    && apt-get update \
    && apt-get install -y --no-install-recommends nodejs \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Docling CLI, invoked by actor.sh.
# NOTE(review): the version is unpinned, so rebuilds are not reproducible;
# consider pinning (docling==<version>) once a target version is chosen.
RUN pip install --no-cache-dir docling

# Apify CLI, used by actor.sh to read the Actor input and store the result.
RUN npm install -g apify-cli && npm cache clean --force

WORKDIR /app

# Private scratch directory (mode 0700).
# NOTE(review): nothing visible here references it; presumably a runtime dir
# for one of the tools — confirm before removing.
RUN mkdir -p /tmp/runtime-root && chmod 0700 /tmp/runtime-root

# Copies the whole build context, including .actor/, so no separate
# `COPY .actor/ .actor/` is needed.
COPY . .

# Absolute path so start-up does not depend on the working directory.
ENTRYPOINT ["/app/.actor/actor.sh"]
|
13
.actor/actor.json
Normal file
13
.actor/actor.json
Normal file
@ -0,0 +1,13 @@
|
||||
{
|
||||
"actorSpecification": 1,
|
||||
"name": "docling",
|
||||
"version": "0.0",
|
||||
"buildTag": "latest",
|
||||
"environmentVariables": {},
|
||||
"dockerFile": "./Dockerfile",
|
||||
"dockerContext": "../",
|
||||
"input": "./input_schema.json",
|
||||
"scripts": {
|
||||
"run": "./actor.sh"
|
||||
}
|
||||
}
|
99
.actor/actor.sh
Executable file
99
.actor/actor.sh
Executable file
@ -0,0 +1,99 @@
|
||||
#!/bin/bash
#
# Entry point of the Docling Apify Actor.
#
# Reads the Actor input (documentUrl, outputFormat, ocr) through the Apify CLI,
# runs the Docling CLI on the document, and stores the converted file in the
# default Key-Value Store under the record key OUTPUT_RESULT.
# Exits non-zero with an "Error: ..." message on any failure.

# --- Setup Error Handling ---

trap 'echo "Error on line $LINENO"; exit 1' ERR
set -e

# --- Validate Docling Installation ---

if ! command -v docling &> /dev/null; then
    echo "Error: Docling CLI is not installed or not in PATH"
    exit 1
fi

# --- Input Parsing ---

echo "Parsing actor input..."
INPUT=$(apify actor:get-input || { echo "Failed to get input"; exit 1; })

# `// empty` maps a missing or null field to an empty string. Plain `jq -r`
# prints the literal text "null" for such fields, which would slip past the
# -z checks below.
DOCUMENT_URL=$(echo "$INPUT" | jq -r '.documentUrl // empty')
OUTPUT_FORMAT=$(echo "$INPUT" | jq -r '.outputFormat // empty')

if [ -z "$DOCUMENT_URL" ]; then
    echo "Error: Missing document URL. Please provide 'documentUrl' in the input"
    exit 1
fi

if [ -z "$OUTPUT_FORMAT" ]; then
    OUTPUT_FORMAT="md"
    echo "No output format specified. Defaulting to 'md'"
fi

case "$OUTPUT_FORMAT" in
    md|json|html|text|doctags)
        ;;
    *)
        echo "Error: Invalid output format '$OUTPUT_FORMAT'. Supported formats are 'md', 'json', 'html', 'text', and 'doctags'"
        exit 1
        ;;
esac

# Derived only after the format has been defaulted and validated, so the name
# never ends in an empty or bogus extension.
OUTPUT_NAME="output_file.$OUTPUT_FORMAT"

# --- Build Docling Command ---

# Built as an array so the user-supplied URL is passed as exactly one argument
# (no word splitting or glob expansion on spaces, '*', '?', ...).
DOC_CONVERT_CMD=(docling --verbose "$DOCUMENT_URL" --to "$OUTPUT_FORMAT")

# Forward the OCR choice explicitly in both directions: the Docling CLI has
# OCR enabled by default, so only adding --ocr for "true" would make
# "ocr": false a no-op. When the field is absent, Docling's default applies.
case "$(echo "$INPUT" | jq -r '.ocr')" in
    true)
        DOC_CONVERT_CMD+=(--ocr)
        ;;
    false)
        DOC_CONVERT_CMD+=(--no-ocr)
        ;;
esac

# --- Process Document with Docling ---

echo "Processing document with Docling CLI..."
echo "Running: ${DOC_CONVERT_CMD[*]}"

# Marker file: anything Docling writes afterwards is newer than this.
touch docling.timestamp

"${DOC_CONVERT_CMD[@]}" > docling.log 2>&1 || {
    echo "Error: Docling CLI failed. Check 'docling.log' for details";
    cat docling.log;
    exit 1;
}

# Locate the file Docling just produced. `head -n 1` keeps a single path so
# the mv below cannot fail when several new files share the extension.
# NOTE(review): this assumes Docling names its output "*.<format>"; confirm
# for 'text', where the CLI may write a ".txt" file instead.
GENERATED_FILE=$(find . -type f -name "*.$OUTPUT_FORMAT" -newer docling.timestamp | head -n 1)

if [ -z "$GENERATED_FILE" ]; then
    echo "Error: Could not find generated output file with extension .$OUTPUT_FORMAT"
    exit 1
fi

mv "$GENERATED_FILE" "$OUTPUT_NAME"

# --- Validate Output ---

if [ ! -f "$OUTPUT_NAME" ]; then
    echo "Error: Expected output file '$OUTPUT_NAME' was not generated"
    exit 1
fi

if [ ! -s "$OUTPUT_NAME" ]; then
    echo "Error: Generated output file '$OUTPUT_NAME' is empty"
    exit 1
fi

echo "Document successfully processed and exported as '$OUTPUT_FORMAT' to file: $OUTPUT_NAME"

# --- Store Output in Key-Value Store ---

echo "Pushing processed document to Key-Value Store (record key: OUTPUT_RESULT)..."
# NOTE(review): "application/<format>" is only a registered media type for
# json; consider text/markdown, text/html, text/plain for the other formats.
apify actor:set-value "OUTPUT_RESULT" --contentType "application/$OUTPUT_FORMAT" < "$OUTPUT_NAME" || {
    echo "Error: Failed to push the output document to the Key-Value Store"
    exit 1
}

# --- Cleanup Temporary Files ---

# Best-effort: a failed cleanup must not fail an otherwise successful run.
rm -f docling.timestamp docling.log || true

echo "Done!"
|
30
.actor/input_schema.json
Normal file
30
.actor/input_schema.json
Normal file
@ -0,0 +1,30 @@
|
||||
{
|
||||
"title": "Docling Actor Input",
|
||||
"description": "Options for running Docling CLI on the Apify platform.",
|
||||
"type": "object",
|
||||
"schemaVersion": 1,
|
||||
"properties": {
|
||||
"documentUrl": {
|
||||
"title": "Document URL",
|
||||
"type": "string",
|
||||
"description": "URL of the document to process with Docling. Supported formats: images, 'pdf', 'docx', 'pptx', 'xlsx', 'html', 'md', 'xml_pubmed', 'asciidoc', 'xml_uspto'.",
|
||||
"prefill": "https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf",
|
||||
"editor": "textfield"
|
||||
},
|
||||
"outputFormat": {
|
||||
"title": "Output Format",
|
||||
"type": "string",
|
||||
"description": "Specifies the desired output format after processing the document. Supported formats: 'md', 'json', 'html', 'text', 'doctags'.",
|
||||
"enum": ["md", "json", "html", "text", "doctags"],
|
||||
"default": "md",
|
||||
"editor": "select"
|
||||
},
|
||||
"ocr": {
|
||||
"title": "Enable OCR",
|
||||
"type": "boolean",
|
||||
"description": "If true, OCR will be applied to scanned PDFs for text recognition.",
|
||||
"default": true
|
||||
}
|
||||
},
|
||||
"required": ["documentUrl"]
|
||||
}
|
Loading…
Reference in New Issue
Block a user