diff --git a/.actor/.dockerignore b/.actor/.dockerignore
new file mode 100644
--- /dev/null
+++ b/.actor/.dockerignore
@@ -0,0 +1,8 @@
+# NOTE(review): actor.json sets dockerContext to "../", and Docker reads the
+# ignore file from the context root (or from "<Dockerfile>.dockerignore" next
+# to the Dockerfile) - confirm this file is actually picked up from .actor/.
+.git
+.gitignore
+*.pyc
+__pycache__
+*.log
diff --git a/.actor/Dockerfile b/.actor/Dockerfile
new file mode 100644
--- /dev/null
+++ b/.actor/Dockerfile
@@ -0,0 +1,53 @@
+# Image for the Docling Apify actor: Python 3.11 with the Docling CLI, plus
+# Node.js 20 and the Apify CLI that the entrypoint script uses for actor I/O.
+FROM python:3.11-slim
+
+# Make a RUN step fail when any stage of a pipeline fails (curl | gpg below).
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Bootstrap tools, the NodeSource repository and the runtime packages share one
+# layer: "apt-get update" is never cached apart from its "install", and the
+# package lists are removed in the layer that created them.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        file \
+        gpg \
+        procps \
+    && mkdir -p /etc/apt/keyrings \
+    && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key \
+        | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
+    && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" \
+        > /etc/apt/sources.list.d/nodesource.list \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends \
+        bash \
+        git \
+        jo \
+        jq \
+        nodejs \
+        xz-utils \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --no-cache-dir docling
+
+RUN npm install -g apify-cli && npm cache clean --force
+
+# Unprivileged account that runs the actor. It owns the work directory and the
+# private runtime directory so the entrypoint script can write its scratch
+# files and the converted document there.
+RUN useradd --create-home --shell /bin/bash --uid 10001 appuser \
+    && install -d -o appuser -g appuser /app \
+    && install -d -m 0700 -o appuser -g appuser /tmp/runtime-root
+
+WORKDIR /app
+
+# The build context is the repository root ("dockerContext": "../" in
+# actor.json), so this single COPY already includes the .actor/ directory.
+COPY --chown=appuser:appuser . .
+
+USER appuser
+
+ENTRYPOINT ["/app/.actor/actor.sh"]
diff --git a/.actor/actor.json b/.actor/actor.json
new file mode 100644
index 00000000..4d9bb9d4
--- /dev/null
+++ b/.actor/actor.json
@@ -0,0 +1,13 @@
+{
+    "actorSpecification": 1,
+    "name": "docling",
+    "version": "0.0",
+    "buildTag": "latest",
+    "environmentVariables": {},
+    "dockerFile": "./Dockerfile",
+    "dockerContext": "../",
+    "input": "./input_schema.json",
+    "scripts": {
+        "run": "./actor.sh"
+    }
+}
diff --git a/.actor/actor.sh b/.actor/actor.sh
new file mode 100755
--- /dev/null
+++ b/.actor/actor.sh
@@ -0,0 +1,146 @@
+#!/bin/bash
+#
+# Entry point of the Docling Apify actor.
+#
+# Reads the actor input (documentUrl, outputFormat, ocr), converts the document
+# with the Docling CLI and stores the result in the key-value store under the
+# record key OUTPUT_RESULT.
+
+# --- Setup Error Handling ---
+
+trap 'echo "Error on line $LINENO"; exit 1' ERR
+set -eo pipefail
+
+# --- Validate Docling Installation ---
+
+if ! command -v docling &> /dev/null; then
+    echo "Error: Docling CLI is not installed or not in PATH"
+    exit 1
+fi
+
+# --- Input Parsing ---
+
+echo "Parsing actor input..."
+# The failure branch sits outside the command substitution so that its message
+# is printed instead of being captured into INPUT.
+if ! INPUT=$(apify actor:get-input); then
+    echo "Failed to get input"
+    exit 1
+fi
+
+# "// empty" maps a missing or null field to an empty string; plain "jq -r"
+# would print the literal text "null" and defeat the -z checks below.
+DOCUMENT_URL=$(echo "$INPUT" | jq -r '.documentUrl // empty')
+OUTPUT_FORMAT=$(echo "$INPUT" | jq -r '.outputFormat // empty')
+# No "// empty" here: jq's alternative operator would also swallow false.
+OCR_ENABLED=$(echo "$INPUT" | jq -r '.ocr')
+
+if [ -z "$DOCUMENT_URL" ]; then
+    echo "Error: Missing document URL. Please provide 'documentUrl' in the input"
+    exit 1
+fi
+
+if [ -z "$OUTPUT_FORMAT" ]; then
+    OUTPUT_FORMAT="md"
+    echo "No output format specified. Defaulting to 'md'"
+fi
+
+# Validate the format and pick the MIME type stored with the output record.
+case "$OUTPUT_FORMAT" in
+    md)
+        CONTENT_TYPE="text/markdown"
+        ;;
+    json)
+        CONTENT_TYPE="application/json"
+        ;;
+    html)
+        CONTENT_TYPE="text/html"
+        ;;
+    text|doctags)
+        CONTENT_TYPE="text/plain"
+        ;;
+    *)
+        echo "Error: Invalid output format '$OUTPUT_FORMAT'. Supported formats are 'md', 'json', 'html', 'text', and 'doctags'"
+        exit 1
+        ;;
+esac
+
+# Derived only after the default has been applied, so the name always carries
+# the effective format as its extension.
+OUTPUT_NAME="output_file.$OUTPUT_FORMAT"
+
+# --- Build Docling Command ---
+
+# An array keeps the URL a single argument even if it contains spaces or shell
+# metacharacters.
+DOC_CONVERT_CMD=(docling --verbose "$DOCUMENT_URL" --to "$OUTPUT_FORMAT")
+
+# Pass the switch in both directions: when only --ocr is ever added, an input
+# of "ocr": false has no effect on the command line at all.
+case "$OCR_ENABLED" in
+    true)
+        DOC_CONVERT_CMD+=(--ocr)
+        ;;
+    false)
+        DOC_CONVERT_CMD+=(--no-ocr)
+        ;;
+esac
+
+# --- Process Document with Docling ---
+
+echo "Processing document with Docling CLI..."
+echo "Running: ${DOC_CONVERT_CMD[*]}"
+
+# Marker file: anything newer than it was written by the conversion below.
+touch docling.timestamp
+
+"${DOC_CONVERT_CMD[@]}" > docling.log 2>&1 || {
+    echo "Error: Docling CLI failed. Check 'docling.log' for details"
+    cat docling.log
+    exit 1
+}
+
+# NOTE(review): Docling appears to write plain-text exports as "<name>.txt"
+# rather than "<name>.text" - confirm against the installed version. Both
+# suffixes are accepted for the 'text' format. "-print -quit" stops at the
+# first match so exactly one path is handed to mv.
+if [ "$OUTPUT_FORMAT" = "text" ]; then
+    GENERATED_FILE=$(find . -type f \( -name "*.txt" -o -name "*.text" \) -newer docling.timestamp -print -quit)
+else
+    GENERATED_FILE=$(find . -type f -name "*.$OUTPUT_FORMAT" -newer docling.timestamp -print -quit)
+fi
+
+if [ -z "$GENERATED_FILE" ]; then
+    echo "Error: Could not find generated output file with extension .$OUTPUT_FORMAT"
+    exit 1
+fi
+
+mv "$GENERATED_FILE" "$OUTPUT_NAME"
+
+# --- Validate Output ---
+
+if [ ! -f "$OUTPUT_NAME" ]; then
+    echo "Error: Expected output file '$OUTPUT_NAME' was not generated"
+    exit 1
+fi
+
+if [ ! -s "$OUTPUT_NAME" ]; then
+    echo "Error: Generated output file '$OUTPUT_NAME' is empty"
+    exit 1
+fi
+
+echo "Document successfully processed and exported as '$OUTPUT_FORMAT' to file: $OUTPUT_NAME"
+
+# --- Store Output in Key-Value Store ---
+
+echo "Pushing processed document to Key-Value Store (record key: OUTPUT_RESULT)..."
+apify actor:set-value "OUTPUT_RESULT" --contentType "$CONTENT_TYPE" < "$OUTPUT_NAME" || {
+    echo "Error: Failed to push the output document to the Key-Value Store"
+    exit 1
+}
+
+# --- Cleanup Temporary Files ---
+
+rm -f docling.timestamp docling.log || true
+
+echo "Done!"
diff --git a/.actor/input_schema.json b/.actor/input_schema.json
new file mode 100644
--- /dev/null
+++ b/.actor/input_schema.json
@@ -0,0 +1,30 @@
+{
+    "title": "Docling Actor Input",
+    "description": "Options for running Docling CLI on the Apify platform.",
+    "type": "object",
+    "schemaVersion": 1,
+    "properties": {
+        "documentUrl": {
+            "title": "Document URL",
+            "type": "string",
+            "description": "URL of the document to process with Docling. Supported formats: images, 'pdf', 'docx', 'pptx', 'xlsx', 'html', 'md', 'xml_pubmed', 'asciidoc', 'xml_uspto'.",
+            "prefill": "https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf",
+            "editor": "textfield"
+        },
+        "outputFormat": {
+            "title": "Output Format",
+            "type": "string",
+            "description": "Specifies the desired output format after processing the document. Supported formats: 'md', 'json', 'html', 'text', 'doctags'.",
+            "enum": ["md", "json", "html", "text", "doctags"],
+            "default": "md",
+            "editor": "select"
+        },
+        "ocr": {
+            "title": "Enable OCR",
+            "type": "boolean",
+            "description": "If true, OCR will be applied to scanned PDFs for text recognition.",
+            "default": true
+        }
+    },
+    "required": ["documentUrl"]
+}