From 5f5c0a9d50ca02c0740468b7f253752e455dc656 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Van=C4=8Dura?= Date: Sun, 9 Mar 2025 15:51:39 +0100 Subject: [PATCH] Actor: Refactor `actor.sh` and add `docling_processor.py` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the `actor.sh` script to modularize functions for finding the Apify CLI, setting up a temporary environment, and cleaning it up. Introduce a new function, `get_actor_input()`, to handle input detection more robustly. Replace inline Python conversion logic with an external script, `docling_processor.py`, for processing documents via the docling-serve API. Signed-off-by: Václav Vančura --- .actor/Dockerfile | 1 + .actor/actor.sh | 627 ++++++++---------------------------- .actor/docling_processor.py | 376 +++++++++++++++++++++ 3 files changed, 518 insertions(+), 486 deletions(-) create mode 100755 .actor/docling_processor.py diff --git a/.actor/Dockerfile b/.actor/Dockerfile index 18462815..7c4615c1 100644 --- a/.actor/Dockerfile +++ b/.actor/Dockerfile @@ -62,6 +62,7 @@ ENV EASYOCR_MODULE_PATH=/tmp/easyocr-models COPY --chown=1000:1000 .actor/actor.sh .actor/actor.sh COPY --chown=1000:1000 .actor/actor.json .actor/actor.json COPY --chown=1000:1000 .actor/input_schema.json .actor/input_schema.json +COPY --chown=1000:1000 .actor/docling_processor.py .actor/docling_processor.py RUN chmod +x .actor/actor.sh # Copy the build files from builder diff --git a/.actor/actor.sh b/.actor/actor.sh index c667d31c..a3ea5616 100755 --- a/.actor/actor.sh +++ b/.actor/actor.sh @@ -8,25 +8,14 @@ upload_to_kvs() { local description="$4" # Find the Apify CLI command - local apify_cmd="" - for cmd in "apify" "actor" "/usr/local/bin/apify" "/usr/bin/apify" "/opt/apify/cli/bin/apify"; do - if command -v "$cmd" &> /dev/null; then - apify_cmd="$cmd" - break - fi - done + find_apify_cmd + local apify_cmd="$FOUND_APIFY_CMD" if [ -n "$apify_cmd" ]; then echo "Uploading 
$description to key-value store (key: $key_name)..." # Create a temporary home directory with write permissions - export TMPDIR="/tmp/apify-home-${RANDOM}" - mkdir -p "$TMPDIR" - - # Multiple strategies to disable version checking - export APIFY_DISABLE_VERSION_CHECK=1 - export NODE_OPTIONS="--no-warnings" - export HOME="$TMPDIR" # Override home directory to writable location + setup_temp_environment # Use the --no-update-notifier flag if available if $apify_cmd --help | grep -q "\--no-update-notifier"; then @@ -34,7 +23,7 @@ upload_to_kvs() { echo "Successfully uploaded $description to key-value store" local url="https://api.apify.com/v2/key-value-stores/${APIFY_DEFAULT_KEY_VALUE_STORE_ID}/records/$key_name" echo "$description available at: $url" - rm -rf "$TMPDIR" 2>/dev/null || true # Clean up temp dir + cleanup_temp_environment return 0 fi else @@ -43,13 +32,13 @@ upload_to_kvs() { echo "Successfully uploaded $description to key-value store" local url="https://api.apify.com/v2/key-value-stores/${APIFY_DEFAULT_KEY_VALUE_STORE_ID}/records/$key_name" echo "$description available at: $url" - rm -rf "$TMPDIR" 2>/dev/null || true # Clean up temp dir + cleanup_temp_environment return 0 fi fi echo "ERROR: Failed to upload $description to key-value store" - rm -rf "$TMPDIR" 2>/dev/null || true # Clean up temp dir + cleanup_temp_environment return 1 else echo "ERROR: Apify CLI not found for $description upload" @@ -57,6 +46,64 @@ upload_to_kvs() { fi } +# Function to find Apify CLI command +find_apify_cmd() { + FOUND_APIFY_CMD="" + for cmd in "apify" "actor" "/usr/local/bin/apify" "/usr/bin/apify" "/opt/apify/cli/bin/apify"; do + if command -v "$cmd" &> /dev/null; then + FOUND_APIFY_CMD="$cmd" + break + fi + done +} + +# Function to set up temporary environment for Apify CLI +setup_temp_environment() { + export TMPDIR="/tmp/apify-home-${RANDOM}" + mkdir -p "$TMPDIR" + export APIFY_DISABLE_VERSION_CHECK=1 + export NODE_OPTIONS="--no-warnings" + export HOME="$TMPDIR" # 
Override home directory to writable location +} + +# Function to clean up temporary environment +cleanup_temp_environment() { + rm -rf "$TMPDIR" 2>/dev/null || true +} + +# Function to push data to Apify dataset +push_to_dataset() { + local document_url="$1" + local result_url="$2" + + # Find Apify CLI command + find_apify_cmd + local apify_cmd="$FOUND_APIFY_CMD" + + if [ -n "$apify_cmd" ]; then + echo "Adding record to dataset..." + setup_temp_environment + + # Use the --no-update-notifier flag if available + if $apify_cmd --help | grep -q "\--no-update-notifier"; then + if $apify_cmd --no-update-notifier actor:push-data "{\"url\": \"${document_url}\", \"output_file\": \"${result_url}\", \"status\": \"success\"}"; then + echo "Successfully added record to dataset" + else + echo "Warning: Failed to add record to dataset" + fi + else + # Fall back to regular command + if $apify_cmd actor:push-data "{\"url\": \"${document_url}\", \"output_file\": \"${result_url}\", \"status\": \"success\"}"; then + echo "Successfully added record to dataset" + else + echo "Warning: Failed to add record to dataset" + fi + fi + + cleanup_temp_environment + fi +} + # --- Setup logging and error handling --- @@ -98,6 +145,17 @@ else echo "Warning: No build files directory found. Some tools may be unavailable." fi +# Copy Python processor script to tools directory +PYTHON_SCRIPT_PATH="$(dirname "$0")/docling_processor.py" +if [ -f "$PYTHON_SCRIPT_PATH" ]; then + echo "Copying Python processor script to tools directory..." + cp "$PYTHON_SCRIPT_PATH" "$TOOLS_DIR/" + chmod +x "$TOOLS_DIR/docling_processor.py" +else + echo "ERROR: Python processor script not found at $PYTHON_SCRIPT_PATH" + exit 1 +fi + # Check OCR directories and ensure they're writable echo "Checking OCR directory permissions..." 
OCR_DIR="/opt/app-root/src/.EasyOCR" @@ -108,6 +166,7 @@ if [ -d "$OCR_DIR" ]; then rm "$OCR_DIR/test_write" else echo "[✗] OCR directory is not writable, setting up alternative in /tmp" + # Create alternative in /tmp (which is writable) mkdir -p "/tmp/.EasyOCR/user_network" export EASYOCR_MODULE_PATH="/tmp/.EasyOCR" @@ -224,41 +283,52 @@ DOCLING_API_ENDPOINT="http://localhost:5001/v1alpha/convert/source" echo "Starting document processing..." echo "Reading input from Apify..." -INPUT="" +# Function to handle Actor input detection +get_actor_input() { + local input="" -# Create directory if it doesn't exist -mkdir -p "/tmp/actor-input" || echo "Warning: Could not create /tmp/actor-input directory" + # Create directory if it doesn't exist + mkdir -p "/tmp/actor-input" || echo "Warning: Could not create /tmp/actor-input directory" >&2 -# List all possible input locations for debugging -echo "Listing potential input file locations:" -ls -la "/tmp/actor-input/" 2>/dev/null || echo "Cannot list /tmp/actor-input/" -ls -la "/input/" 2>/dev/null || echo "Cannot list /input/" + # If /tmp/actor-input/INPUT exists as a directory, remove it + if [ -d "/tmp/actor-input/INPUT" ]; then + echo "Warning: /tmp/actor-input/INPUT exists as a directory. Removing it to create a file." 
>&2 + rm -rf "/tmp/actor-input/INPUT" + fi -# Check multiple potential locations for input file -if [ -f "/tmp/actor-input/INPUT" ]; then - echo "Found standard Actor input file at /tmp/actor-input/INPUT" - echo "Content:" - cat "/tmp/actor-input/INPUT" - INPUT=$(cat "/tmp/actor-input/INPUT") -elif [ -f "/input/INPUT" ]; then - echo "Found Actor input file at /input/INPUT" - echo "Content:" - cat "/input/INPUT" - INPUT=$(cat "/input/INPUT") -# Fallback to environment variable -elif [ -n "$APIFY_INPUT_JSON" ]; then - echo "Using APIFY_INPUT_JSON environment variable" - INPUT="$APIFY_INPUT_JSON" -# Last resort: use test input - now defaulting to md as requested -else - echo "No input found, using test input with md format" - TEST_INPUT='{"documentUrl":"https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf","ocr":true,"outputFormat":"md"}' - mkdir -p "/tmp/actor-input" - echo "$TEST_INPUT" > "/tmp/actor-input/INPUT" - INPUT="$TEST_INPUT" -fi + # Check multiple potential locations for input file + if [ -f "/tmp/actor-input/INPUT" ]; then + echo "Found standard Actor input file at /tmp/actor-input/INPUT" >&2 + input=$(cat "/tmp/actor-input/INPUT") + elif [ -f "/input/INPUT" ]; then + echo "Found Actor input file at /input/INPUT" >&2 + input=$(cat "/input/INPUT") -echo "Input content: $INPUT" + # Fallback to environment variable + elif [ -n "$APIFY_INPUT_JSON" ]; then + echo "Using APIFY_INPUT_JSON environment variable" >&2 + input="$APIFY_INPUT_JSON" + + # Last resort: use test input with md format + else + echo "No input found, using test input with md format" >&2 + TEST_INPUT='{"documentUrl":"https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf","ocr":true,"outputFormat":"md"}' + mkdir -p "/tmp/actor-input" + echo "$TEST_INPUT" > "/tmp/actor-input/INPUT" + + # Read back the test input to ensure we get clean JSON + input=$(cat "/tmp/actor-input/INPUT") + fi + + # Return only the JSON 
content + echo "$input" +} + +# Get actor input +INPUT=$(get_actor_input) +echo "Input content:" >&2 +echo "$INPUT" >&2 # Send the raw input to stderr for debugging +echo "$INPUT" # Send the clean JSON to stdout for processing # Extract values from INPUT using Python echo "Using Python to parse input..." @@ -271,13 +341,17 @@ if [ -z "$DOCUMENT_URL" ]; then echo "ERROR: No document URL provided in input" # Try to push data to Actor but don't exit if it fails - if command -v actor &> /dev/null; then + find_apify_cmd + apify_cmd="$FOUND_APIFY_CMD" + if [ -n "$apify_cmd" ]; then echo "Reporting missing document URL to Actor storage..." - if actor push-data "{\"status\": \"error\", \"error\": \"No document URL provided in input\"}" 2>&1; then + setup_temp_environment + if $apify_cmd actor:push-data "{\"status\": \"error\", \"error\": \"No document URL provided in input\"}" 2>&1; then echo "Successfully pushed error message to Actor storage" else echo "Warning: Failed to push error message to Actor storage" fi + cleanup_temp_environment fi # Use default document URL for testing instead of exiting @@ -290,410 +364,29 @@ if [ -z "$OUTPUT_FORMAT" ]; then OUTPUT_FORMAT="md" fi +# Ensure OCR_ENABLED has a valid boolean value +if [ -z "$OCR_ENABLED" ]; then + echo "No OCR setting specified, defaulting to true" + OCR_ENABLED="true" +fi + echo "Input values: documentUrl=$DOCUMENT_URL, outputFormat=$OUTPUT_FORMAT, ocr=$OCR_ENABLED" # Create the request JSON REQUEST_JSON="{\"options\":{\"to_formats\":[\"$OUTPUT_FORMAT\"],\"ocr\":$OCR_ENABLED},\"http_sources\":[{\"url\":\"$DOCUMENT_URL\"}]}" +echo "Creating request JSON:" >&2 +echo "$REQUEST_JSON" >&2 echo "$REQUEST_JSON" > "$API_DIR/request.json" -# Send the conversion request +# Send the conversion request using our Python script echo "Sending conversion request to docling-serve API..." 
-python -c " -import json -import time -import sys -import os -import traceback - -try: - # Load request data from temporary location - with open('$API_DIR/request.json', 'r') as f: - request_data = json.load(f) - - print(f'Request to convert URL: {request_data[\"http_sources\"][0][\"url\"]}') - print(f'Output format: {request_data[\"options\"][\"to_formats\"][0]}') - print(f'OCR enabled: {request_data[\"options\"][\"ocr\"]}') - - # Try requests first, fall back to urllib - try: - import requests - print('Using requests library for API call') - - # Record start time for timing - start_time = time.time() - print(f'Starting conversion request at {time.strftime(\"%H:%M:%S\")}') - - response = requests.post( - '$DOCLING_API_ENDPOINT', - json=request_data, - timeout=300 # 5 minutes timeout - ) - - elapsed = time.time() - start_time - print(f'Conversion request completed in {elapsed:.2f} seconds') - print(f'Response status code: {response.status_code}') - - # Save the full response for debugging - with open('$API_DIR/full_response.txt', 'w') as f: - f.write(f'Status code: {response.status_code}\\n') - f.write(f'Headers: {response.headers}\\n\\n') - f.write(f'Content: {response.text[:10000]}...' 
if len(response.text) > 10000 else f'Content: {response.text}') - - if response.status_code == 200: - with open('$API_DIR/response.json', 'w') as f: - f.write(response.text) - - # Parse the response even if it's not valid JSON - try: - resp_data = response.json() - print('Successfully parsed response as JSON') - - # Save detailed diagnostics about the response structure - with open('$API_DIR/response_structure.txt', 'w') as f: - f.write(f'Response keys: {list(resp_data.keys())}\\n') - if 'document' in resp_data: - f.write(f'Document keys: {list(resp_data[\"document\"].keys() if resp_data[\"document\"] else [])}\\n') - - # Check for specific content fields with null safety - doc = resp_data['document'] or {} - if 'html_content' in doc and doc['html_content']: - f.write(f'HTML content length: {len(doc[\"html_content\"])}\\n') - elif 'html_content' in doc: - f.write('HTML content is present but empty or null\\n') - - if 'md_content' in doc and doc['md_content']: - f.write(f'Markdown content length: {len(doc[\"md_content\"])}\\n') - elif 'md_content' in doc: - f.write('Markdown content is present but empty or null\\n') - - if 'text_content' in doc and doc['text_content']: - f.write(f'Text content length: {len(doc[\"text_content\"])}\\n') - elif 'text_content' in doc: - f.write('Text content is present but empty or null\\n') - - if 'json_content' in doc and doc['json_content']: - f.write(f'JSON content length: {len(doc[\"json_content\"])}\\n') - elif 'json_content' in doc: - f.write('JSON content is present but empty or null\\n') - - if 'outputs' in resp_data: - f.write(f'Outputs count: {len(resp_data[\"outputs\"])}\\n') - if resp_data['outputs']: - f.write(f'First output keys: {list(resp_data[\"outputs\"][0].keys())}\\n') - if 'files' in resp_data['outputs'][0]: - f.write(f'Files count: {len(resp_data[\"outputs\"][0][\"files\"])}\\n') - if resp_data['outputs'][0]['files']: - f.write(f'First file keys: {list(resp_data[\"outputs\"][0][\"files\"][0].keys())}\\n') - if 
'content' in resp_data['outputs'][0]['files'][0]: - content_length = len(resp_data['outputs'][0]['files'][0]['content']) - f.write(f'Content length: {content_length}\\n') - - # Process the response - check for outputs and files - if 'outputs' in resp_data and resp_data['outputs']: - output = resp_data['outputs'][0] - print(f'Found {len(resp_data[\"outputs\"])} outputs in response') - - if 'files' in output and output['files']: - file_data = output['files'][0] - print(f'Found {len(output[\"files\"])} files in output') - - if 'content' in file_data and file_data['content']: - print(f'Found content in file (length: {len(file_data[\"content\"])})') - with open('$API_DIR/output.$OUTPUT_FORMAT', 'w') as f: - f.write(file_data['content']) - print('CONVERSION SUCCESS') - sys.exit(0) - else: - if 'content' in file_data: - print('Content field exists but is empty') - else: - print('No content field in file data') - print(f'Available fields: {list(file_data.keys())}') - else: - print('No files found in output') - print(f'Available fields: {list(output.keys())}') - - # Alternative response format check - document field - elif 'document' in resp_data and resp_data['status'] == 'success': - print('Found alternative response format with document field') - document = resp_data['document'] or {} - - # Check format fields in document to see what's available - available_formats = [] - if 'html_content' in document and document['html_content']: - available_formats.append(('html', document['html_content'])) - if 'md_content' in document and document['md_content']: - available_formats.append(('md', document['md_content'])) - if 'text_content' in document and document['text_content']: - available_formats.append(('text', document['text_content'])) - if 'json_content' in document and document['json_content']: - available_formats.append(('json', document['json_content'])) - - if available_formats: - print(f'Found {len(available_formats)} available formats: {[f[0] for f in 
available_formats]}') - - # First try to find the exact requested format - requested_format_match = next((f for f in available_formats if f[0] == '$OUTPUT_FORMAT'.lower()), None) - - if requested_format_match: - format_type, content = requested_format_match - print(f'Found content in requested format {format_type} (length: {len(content)})') - else: - # If requested format not found, use the first available - format_type, content = available_formats[0] - print(f'Requested format not found, using alternative format {format_type} (length: {len(content)})') - - # Save the content to the output file with appropriate extension - with open(f'$API_DIR/output.{format_type}', 'w') as f: - f.write(content) - - # If we're using a different format than requested, also save with requested extension - if format_type != '$OUTPUT_FORMAT'.lower(): - print(f'Saving content with requested extension {format_type} -> $OUTPUT_FORMAT') - with open('$API_DIR/output.$OUTPUT_FORMAT', 'w') as f: - f.write(content) - - print('CONVERSION SUCCESS') - sys.exit(0) - else: - # No content fields found or all are empty - # Check if fields exist but are empty or null - empty_fields = [] - if 'html_content' in document and not document['html_content']: - empty_fields.append('html_content') - if 'md_content' in document and not document['md_content']: - empty_fields.append('md_content') - if 'text_content' in document and not document['text_content']: - empty_fields.append('text_content') - - if empty_fields: - print(f'Found content fields but they are empty or null: {empty_fields}') - else: - print('No content fields found in document') - - print(f'Available fields in document: {list(document.keys() if document else [])}') - else: - print('No outputs found in response') - print(f'Available fields: {list(resp_data.keys())}') - - # Try to extract any alternate formats or metadata - if 'metadata' in resp_data: - print('Metadata found in response, saving to file') - with open('$API_DIR/metadata.json', 'w') 
as f: - json.dump(resp_data['metadata'], f, indent=2) - - print('CONVERSION PARTIAL - Some data available but not complete') - except Exception as json_error: - print(f'Failed to parse response as JSON: {json_error}') - traceback.print_exc() - - # Save raw content as text if JSON parsing fails - with open('$API_DIR/output.txt', 'w') as f: - f.write(response.text) - print('Saved raw response as text file') - print('CONVERSION PARTIAL - Raw response saved') - else: - print(f'Error response: {response.text[:500]}') - print('CONVERSION FAILED') - - except ImportError: - # Fall back to urllib - import urllib.request - import urllib.error - - print('Using urllib library for API call') - headers = {'Content-Type': 'application/json'} - req_data = json.dumps(request_data).encode('utf-8') - - req = urllib.request.Request( - '$DOCLING_API_ENDPOINT', - data=req_data, - headers=headers, - method='POST' - ) - - try: - start_time = time.time() - print(f'Starting conversion request at {time.strftime(\"%H:%M:%S\")}') - - with urllib.request.urlopen(req, timeout=300) as response: - elapsed = time.time() - start_time - print(f'Conversion request completed in {elapsed:.2f} seconds') - print(f'Response status: {response.status}') - - if response.status == 200: - response_text = response.read().decode('utf-8') - - # Save full response for debugging - with open('$API_DIR/full_response.txt', 'w') as f: - f.write(f'Status: {response.status}\\n') - f.write(f'Headers: {response.headers}\\n\\n') - f.write(f'Content: {response_text[:10000]}...' 
if len(response_text) > 10000 else f'Content: {response_text}') - - with open('$API_DIR/response.json', 'w') as f: - f.write(response_text) - - try: - resp_data = json.loads(response_text) - print('Successfully parsed response as JSON') - - # Save detailed diagnostics about the response structure - with open('$API_DIR/response_structure.txt', 'w') as f: - f.write(f'Response keys: {list(resp_data.keys())}\\n') - if 'document' in resp_data: - f.write(f'Document keys: {list(resp_data[\"document\"].keys() if resp_data[\"document\"] else [])}\\n') - - # Check for specific content fields with null safety - doc = resp_data['document'] or {} - if 'html_content' in doc and doc['html_content']: - f.write(f'HTML content length: {len(doc[\"html_content\"])}\\n') - elif 'html_content' in doc: - f.write('HTML content is present but empty or null\\n') - - if 'md_content' in doc and doc['md_content']: - f.write(f'Markdown content length: {len(doc[\"md_content\"])}\\n') - elif 'md_content' in doc: - f.write('Markdown content is present but empty or null\\n') - - if 'text_content' in doc and doc['text_content']: - f.write(f'Text content length: {len(doc[\"text_content\"])}\\n') - elif 'text_content' in doc: - f.write('Text content is present but empty or null\\n') - - if 'json_content' in doc and doc['json_content']: - f.write(f'JSON content length: {len(doc[\"json_content\"])}\\n') - elif 'json_content' in doc: - f.write('JSON content is present but empty or null\\n') - - if 'outputs' in resp_data: - f.write(f'Outputs count: {len(resp_data[\"outputs\"])}\\n') - if resp_data['outputs']: - f.write(f'First output keys: {list(resp_data[\"outputs\"][0].keys())}\\n') - if 'files' in resp_data['outputs'][0]: - f.write(f'Files count: {len(resp_data[\"outputs\"][0][\"files\"])}\\n') - if resp_data['outputs'][0]['files']: - f.write(f'First file keys: {list(resp_data[\"outputs\"][0][\"files\"][0].keys())}\\n') - if 'content' in resp_data['outputs'][0]['files'][0]: - content_length = 
len(resp_data['outputs'][0]['files'][0]['content']) - f.write(f'Content length: {content_length}\\n') - - if 'outputs' in resp_data and resp_data['outputs']: - output = resp_data['outputs'][0] - print(f'Found {len(resp_data[\"outputs\"])} outputs in response') - - if 'files' in output and output['files']: - file_data = output['files'][0] - print(f'Found {len(output[\"files\"])} files in output') - - if 'content' in file_data and file_data['content']: - print(f'Found content in file (length: {len(file_data[\"content\"])})') - with open('$API_DIR/output.$OUTPUT_FORMAT', 'w') as f: - f.write(file_data['content']) - print('CONVERSION SUCCESS') - sys.exit(0) - else: - if 'content' in file_data: - print('Content field exists but is empty') - else: - print('No content field in file data') - print(f'Available fields: {list(file_data.keys())}') - else: - print('No files found in output') - print(f'Available fields: {list(output.keys())}') - - # Alternative response format check - document field - elif 'document' in resp_data and resp_data['status'] == 'success': - print('Found alternative response format with document field') - document = resp_data['document'] or {} - - # Check format fields in document to see what's available - available_formats = [] - if 'html_content' in document and document['html_content']: - available_formats.append(('html', document['html_content'])) - if 'md_content' in document and document['md_content']: - available_formats.append(('md', document['md_content'])) - if 'text_content' in document and document['text_content']: - available_formats.append(('text', document['text_content'])) - if 'json_content' in document and document['json_content']: - available_formats.append(('json', document['json_content'])) - - if available_formats: - print(f'Found {len(available_formats)} available formats: {[f[0] for f in available_formats]}') - - # First try to find the exact requested format - requested_format_match = next((f for f in available_formats if f[0] 
== '$OUTPUT_FORMAT'.lower()), None) - - if requested_format_match: - format_type, content = requested_format_match - print(f'Found content in requested format {format_type} (length: {len(content)})') - else: - # If requested format not found, use the first available - format_type, content = available_formats[0] - print(f'Requested format not found, using alternative format {format_type} (length: {len(content)})') - - # Save the content to the output file with appropriate extension - with open(f'$API_DIR/output.{format_type}', 'w') as f: - f.write(content) - - # If we're using a different format than requested, also save with requested extension - if format_type != '$OUTPUT_FORMAT'.lower(): - print(f'Saving content with requested extension {format_type} -> $OUTPUT_FORMAT') - with open('$API_DIR/output.$OUTPUT_FORMAT', 'w') as f: - f.write(content) - - print('CONVERSION SUCCESS') - sys.exit(0) - else: - # No content fields found or all are empty - # Check if fields exist but are empty or null - empty_fields = [] - if 'html_content' in document and not document['html_content']: - empty_fields.append('html_content') - if 'md_content' in document and not document['md_content']: - empty_fields.append('md_content') - if 'text_content' in document and not document['text_content']: - empty_fields.append('text_content') - - if empty_fields: - print(f'Found content fields but they are empty or null: {empty_fields}') - else: - print('No content fields found in document') - - print(f'Available fields in document: {list(document.keys() if document else [])}') - else: - print('No outputs found in response') - print(f'Available fields: {list(resp_data.keys())}') - - print('CONVERSION PARTIAL - Some data available but not complete') - except Exception as json_error: - print(f'Failed to parse response as JSON: {json_error}') - traceback.print_exc() - - # Save raw content as text if JSON parsing fails - with open('$API_DIR/output.txt', 'w') as f: - f.write(response_text) - 
print('Saved raw response as text file') - print('CONVERSION PARTIAL - Raw response saved') - else: - print(f'Error status: {response.status}') - print('CONVERSION FAILED') - except urllib.error.HTTPError as e: - print(f'HTTP Error: {e.code} - {e.reason}') - print(f'Response body: {e.read().decode(\"utf-8\")[:500]}') - print('CONVERSION FAILED') - except urllib.error.URLError as e: - print(f'URL Error: {e.reason}') - print('CONVERSION FAILED') - except Exception as e: - print(f'Unexpected error during urllib request: {e}') - traceback.print_exc() - print('CONVERSION FAILED') -except Exception as e: - print(f'Error during conversion: {e}') - traceback.print_exc() - print('CONVERSION FAILED') -" 2>&1 +python "$TOOLS_DIR/docling_processor.py" \ + --api-endpoint "$DOCLING_API_ENDPOINT" \ + --request-json "$API_DIR/request.json" \ + --output-dir "$API_DIR" \ + --output-format "$OUTPUT_FORMAT" +PYTHON_EXIT_CODE=$? # --- Check for various potential output files --- @@ -732,46 +425,8 @@ if [ -f "$API_DIR/output.$OUTPUT_FORMAT" ]; then echo "Your document is available at: ${RESULT_URL}" echo "==============================" - # Find the Apify CLI again (reusing the function's logic would be better, but for clarity we'll repeat) - APIFY_CMD="" - for cmd in "apify" "actor" "/usr/local/bin/apify" "/usr/bin/apify" "/opt/apify/cli/bin/apify"; do - if command -v "$cmd" &> /dev/null; then - APIFY_CMD="$cmd" - break - fi - done - - if [ -n "$APIFY_CMD" ]; then - # Add record to dataset with enhanced version check prevention - echo "Adding record to dataset..." 
- - # Create a temporary home directory with write permissions - export TMPDIR="/tmp/apify-home-${RANDOM}" - mkdir -p "$TMPDIR" - - # Multiple strategies to disable version checking - export APIFY_DISABLE_VERSION_CHECK=1 - export NODE_OPTIONS="--no-warnings" - export HOME="$TMPDIR" # Override home directory to writable location - - # Use the --no-update-notifier flag if available - if $APIFY_CMD --help | grep -q "\--no-update-notifier"; then - if $APIFY_CMD --no-update-notifier actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"output_file\": \"${RESULT_URL}\", \"status\": \"success\"}"; then - echo "Successfully added record to dataset" - else - echo "Warning: Failed to add record to dataset" - fi - else - # Fall back to regular command - if $APIFY_CMD actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"output_file\": \"${RESULT_URL}\", \"status\": \"success\"}"; then - echo "Successfully added record to dataset" - else - echo "Warning: Failed to add record to dataset" - fi - fi - - rm -rf "$TMPDIR" 2>/dev/null || true # Clean up temp dir - fi + # Push data to dataset + push_to_dataset "$DOCUMENT_URL" "$RESULT_URL" fi else echo "ERROR: No converted output file found at $API_DIR/output.$OUTPUT_FORMAT" diff --git a/.actor/docling_processor.py b/.actor/docling_processor.py new file mode 100755 index 00000000..12f10712 --- /dev/null +++ b/.actor/docling_processor.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 +""" +Document Processing Script for Docling-Serve API + +This script handles the communication with the docling-serve API, +processes the conversion request, and saves the output to the specified location. 
+""" + +import argparse +import json +import os +import sys +import time +import traceback +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union + +# Global constants +DEFAULT_TIMEOUT = 300 # 5 minutes +OUTPUT_FORMATS = ["md", "html", "json", "text"] + + +def setup_arg_parser() -> argparse.ArgumentParser: + """Set up command line argument parser.""" + parser = argparse.ArgumentParser(description="Process documents using docling-serve API") + parser.add_argument("--api-endpoint", required=True, help="Docling API endpoint URL") + parser.add_argument("--request-json", required=True, help="Path to JSON file with request data") + parser.add_argument("--output-dir", required=True, help="Directory to save output files") + parser.add_argument("--output-format", required=True, help="Desired output format") + parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, help="Request timeout in seconds") + return parser + + +def load_request_data(json_path: str) -> Dict: + """Load request data from JSON file.""" + try: + with open(json_path, 'r') as f: + return json.load(f) + except (json.JSONDecodeError, FileNotFoundError) as e: + print(f"Error loading request data: {e}") + sys.exit(1) + + +def save_response_diagnostics(response_text: str, status_code: int, headers: Dict, output_dir: str) -> None: + """Save full response and headers for debugging.""" + with open(os.path.join(output_dir, "full_response.txt"), 'w') as f: + f.write(f"Status code: {status_code}\n") + f.write(f"Headers: {headers}\n\n") + + # Truncate very long responses + if len(response_text) > 10000: + f.write(f"Content: {response_text[:10000]}...") + else: + f.write(f"Content: {response_text}") + + +def save_response_json(response_text: str, output_dir: str) -> None: + """Save raw response JSON.""" + with open(os.path.join(output_dir, "response.json"), 'w') as f: + f.write(response_text) + + +def save_structure_info(data: Dict, output_dir: str) -> None: + """Save detailed 
def save_structure_info(data: Dict, output_dir: str) -> None:
    """Save diagnostic information about response structure.

    Writes a human-readable summary of the response's top-level keys, the
    `document` content fields, and the `outputs`/`files` layout to
    ``response_structure.txt`` in *output_dir* so failed or partial
    conversions can be debugged offline.

    NOTE(review): the original ``def`` line fell outside the visible hunk;
    the signature is reconstructed from the call site
    ``save_structure_info(data, output_dir)`` in process_success_response.
    """
    with open(os.path.join(output_dir, "response_structure.txt"), 'w', encoding='utf-8') as f:
        f.write(f'Response keys: {list(data.keys())}\n')

        # Document content details
        if 'document' in data:
            doc = data['document'] or {}  # guard: 'document' may be null
            f.write(f'Document keys: {list(doc.keys() if doc else [])}\n')

            # Report length (or emptiness) of each known content field
            for content_type in ['html_content', 'md_content', 'text_content', 'json_content']:
                if content_type in doc and doc[content_type]:
                    f.write(f'{content_type.replace("_content", "").upper()} content length: {len(doc[content_type])}\n')
                elif content_type in doc:
                    f.write(f'{content_type.replace("_content", "").upper()} content is present but empty or null\n')

        # Output structure details (the 'files' response shape)
        if 'outputs' in data:
            f.write(f'Outputs count: {len(data["outputs"])}\n')
            if data['outputs']:
                output = data['outputs'][0]
                f.write(f'First output keys: {list(output.keys())}\n')

                if 'files' in output:
                    f.write(f'Files count: {len(output["files"])}\n')
                    if output['files']:
                        file_data = output['files'][0]
                        f.write(f'First file keys: {list(file_data.keys())}\n')
                        if 'content' in file_data:
                            content_length = len(file_data['content'])
                            f.write(f'Content length: {content_length}\n')


def extract_content_from_file_output(data: Dict, output_format: str, output_dir: str) -> bool:
    """Extract content from 'files' output format.

    Looks for ``data['outputs'][0]['files'][0]['content']`` and, when
    present and non-empty, saves it as ``output.<output_format>`` in
    *output_dir*.

    Returns:
        True when content was found and saved, False otherwise.
    """
    if 'outputs' not in data or not data['outputs']:
        print('No outputs found in response')
        return False

    output = data['outputs'][0]
    if 'files' not in output or not output['files']:
        print('No files found in output')
        print(f'Available fields: {list(output.keys())}')
        return False

    file_data = output['files'][0]
    if 'content' not in file_data or not file_data['content']:
        # Distinguish "field present but empty" from "field missing"
        if 'content' in file_data:
            print('Content field exists but is empty')
        else:
            print('No content field in file data')
        print(f'Available fields: {list(file_data.keys())}')
        return False

    # Content found, save it
    content = file_data['content']
    print(f'Found content in file (length: {len(content)})')
    with open(os.path.join(output_dir, f"output.{output_format}"), 'w', encoding='utf-8') as f:
        f.write(content)
    print('CONVERSION SUCCESS')
    return True


def extract_content_from_document(data: Dict, output_format: str, output_dir: str) -> bool:
    """Extract content from 'document' response format.

    Expects ``data['status'] == 'success'`` and a ``document`` mapping with
    one or more of ``html_content``/``md_content``/``text_content``/
    ``json_content``. Prefers the requested *output_format*; otherwise falls
    back to the first available format, saving the content under both the
    matched and the requested extension.

    Returns:
        True when some content was saved, False otherwise.
    """
    if 'document' not in data or data.get('status') != 'success':
        print('No document field or success status found in response')
        return False

    document = data['document'] or {}  # guard: 'document' may be null

    # Collect every non-empty content field as (format, content) pairs
    available_formats = []
    for fmt in ['html', 'md', 'text', 'json']:
        content_field = f'{fmt}_content'
        if content_field in document and document[content_field]:
            available_formats.append((fmt, document[content_field]))

    if not available_formats:
        # Nothing usable — report whether the fields existed but were empty
        empty_fields = []
        for fmt in ['html', 'md', 'text', 'json']:
            content_field = f'{fmt}_content'
            if content_field in document and not document[content_field]:
                empty_fields.append(content_field)

        if empty_fields:
            print(f'Found content fields but they are empty or null: {empty_fields}')
        else:
            print('No content fields found in document')

        print(f'Available fields in document: {list(document.keys() if document else [])}')
        return False

    print(f'Found {len(available_formats)} available formats: {[f[0] for f in available_formats]}')

    # Prefer an exact match on the requested format
    requested_format_match = next((f for f in available_formats if f[0] == output_format.lower()), None)

    if requested_format_match:
        format_type, content = requested_format_match
        print(f'Found content in requested format {format_type} (length: {len(content)})')
    else:
        # Requested format not available — use the first one we have
        format_type, content = available_formats[0]
        print(f'Requested format not found, using alternative format {format_type} (length: {len(content)})')

    # Save with the matched format's extension
    with open(os.path.join(output_dir, f"output.{format_type}"), 'w', encoding='utf-8') as f:
        f.write(content)

    # If we're using a different format than requested, also save with requested extension
    if format_type != output_format.lower():
        print(f'Saving content with requested extension {format_type} -> {output_format}')
        with open(os.path.join(output_dir, f"output.{output_format}"), 'w', encoding='utf-8') as f:
            f.write(content)

    print('CONVERSION SUCCESS')
    return True


def process_success_response(response_text: str, output_format: str, output_dir: str) -> bool:
    """Process a successful (HTTP 200) response and extract document content.

    Saves the raw response and a structure summary, then tries the two known
    response shapes in turn ('files' output, then 'document'). On any
    failure, degrades to saving the raw response as ``output.txt``.

    Returns:
        True on full success; False on partial or failed extraction.
    """
    try:
        # Save raw response for offline inspection
        save_response_json(response_text, output_dir)

        data = json.loads(response_text)
        print('Successfully parsed response as JSON')

        # Save detailed structure info
        save_structure_info(data, output_dir)

        # Try both response formats
        if extract_content_from_file_output(data, output_format, output_dir):
            return True

        if extract_content_from_document(data, output_format, output_dir):
            return True

        # Neither shape yielded content — keep metadata if present
        if 'metadata' in data:
            print('Metadata found in response, saving to file')
            with open(os.path.join(output_dir, "metadata.json"), 'w', encoding='utf-8') as f:
                json.dump(data['metadata'], f, indent=2)

        print('CONVERSION PARTIAL - Some data available but not complete')
        return False

    except Exception as json_error:
        # Broad by design: parse or extraction failures all degrade to
        # saving the raw text so no data is lost.
        print(f'Failed to parse response as JSON: {json_error}')
        traceback.print_exc()

        with open(os.path.join(output_dir, "output.txt"), 'w', encoding='utf-8') as f:
            f.write(response_text)
        print('Saved raw response as text file')
        print('CONVERSION PARTIAL - Raw response saved')
        return False


def process_requests_api(api_endpoint: str, request_data: Dict, output_format: str, output_dir: str, timeout: int) -> bool:
    """Process using requests library.

    Raises:
        ImportError: when the requests package is not installed, so that
            process_document() can fall back to the urllib implementation.
    """
    # BUGFIX: import OUTSIDE the try block. Previously `import requests`
    # sat inside the `try`, and the broad `except Exception` swallowed the
    # ImportError and returned False — making the `except ImportError`
    # urllib fallback in process_document() unreachable dead code.
    import requests

    try:
        print('Using requests library for API call')

        # Record start time for timing
        start_time = time.time()
        print(f'Starting conversion request at {time.strftime("%H:%M:%S")}')

        response = requests.post(
            api_endpoint,
            json=request_data,
            timeout=timeout
        )

        elapsed = time.time() - start_time
        print(f'Conversion request completed in {elapsed:.2f} seconds')
        print(f'Response status code: {response.status_code}')

        # Save response diagnostics
        save_response_diagnostics(response.text, response.status_code, dict(response.headers), output_dir)

        if response.status_code == 200:
            return process_success_response(response.text, output_format, output_dir)

        print(f'Error response: {response.text[:500]}')
        print('CONVERSION FAILED')
        return False

    except Exception as e:
        print(f'Error during requests API call: {e}')
        traceback.print_exc()
        print('CONVERSION FAILED')
        return False


def process_urllib_api(api_endpoint: str, request_data: Dict, output_format: str, output_dir: str, timeout: int) -> bool:
    """Process using urllib as fallback.

    Stdlib-only POST used when the requests package is unavailable.

    Returns:
        True on successful conversion, False on any error.
    """
    try:
        import urllib.request
        import urllib.error

        print('Using urllib library for API call')
        headers = {'Content-Type': 'application/json'}
        req_data = json.dumps(request_data).encode('utf-8')

        req = urllib.request.Request(
            api_endpoint,
            data=req_data,
            headers=headers,
            method='POST'
        )

        try:
            start_time = time.time()
            print(f'Starting conversion request at {time.strftime("%H:%M:%S")}')

            with urllib.request.urlopen(req, timeout=timeout) as response:
                elapsed = time.time() - start_time
                print(f'Conversion request completed in {elapsed:.2f} seconds')
                print(f'Response status: {response.status}')

                response_text = response.read().decode('utf-8')
                save_response_diagnostics(response_text, response.status, dict(response.headers), output_dir)

                if response.status == 200:
                    return process_success_response(response_text, output_format, output_dir)

                print(f'Error status: {response.status}')
                print('CONVERSION FAILED')
                return False

        except urllib.error.HTTPError as e:
            print(f'HTTP Error: {e.code} - {e.reason}')
            print(f'Response body: {e.read().decode("utf-8")[:500]}')
            print('CONVERSION FAILED')
            return False

        except urllib.error.URLError as e:
            print(f'URL Error: {e.reason}')
            print('CONVERSION FAILED')
            return False

        except Exception as e:
            print(f'Unexpected error during urllib request: {e}')
            traceback.print_exc()
            print('CONVERSION FAILED')
            return False

    except Exception as e:
        print(f'Error setting up urllib: {e}')
        traceback.print_exc()
        print('CONVERSION FAILED')
        return False


def process_document(api_endpoint: str, request_json_path: str, output_format: str,
                     output_dir: str, timeout: int) -> bool:
    """Main function to process a document through the docling-serve API.

    Loads the request payload from *request_json_path*, logs a summary,
    POSTs it to *api_endpoint* (requests first, urllib as fallback when
    requests is not installed) and writes results under *output_dir*.

    Returns:
        True on successful conversion, False otherwise.
    """
    try:
        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)

        # Load request data
        request_data = load_request_data(request_json_path)

        # Log request info
        if 'http_sources' in request_data and request_data['http_sources']:
            print(f'Request to convert URL: {request_data["http_sources"][0]["url"]}')

        if 'options' in request_data:
            options = request_data['options']
            if 'to_formats' in options and options['to_formats']:
                print(f'Output format: {options["to_formats"][0]}')
            if 'ocr' in options:
                print(f'OCR enabled: {options["ocr"]}')

        # Try requests first, fall back to urllib (reachable now that
        # process_requests_api lets ImportError propagate)
        try:
            return process_requests_api(api_endpoint, request_data, output_format, output_dir, timeout)
        except ImportError:
            return process_urllib_api(api_endpoint, request_data, output_format, output_dir, timeout)

    except Exception as e:
        print(f'Error during conversion: {e}')
        traceback.print_exc()
        print('CONVERSION FAILED')
        return False


def main():
    """Main entry point: parse CLI args, run the conversion, set exit code."""
    parser = setup_arg_parser()
    args = parser.parse_args()

    success = process_document(
        api_endpoint=args.api_endpoint,
        request_json_path=args.request_json,
        output_format=args.output_format,
        output_dir=args.output_dir,
        timeout=args.timeout
    )

    # Exit with appropriate code
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()