From 5f5c0a9d50ca02c0740468b7f253752e455dc656 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Van=C4=8Dura?= Date: Sun, 9 Mar 2025 15:51:39 +0100 Subject: [PATCH] Actor: Refactor `actor.sh` and add `docling_processor.py` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the `actor.sh` script to modularize functions for finding the Apify CLI, setting up a temporary environment, and cleaning it up. Introduce a new function, `get_actor_input()`, to handle input detection more robustly. Replace inline Python conversion logic with an external script, `docling_processor.py`, for processing documents via the docling-serve API. Signed-off-by: Václav Vančura --- .actor/Dockerfile | 1 + .actor/actor.sh | 627 ++++++++---------------------------- .actor/docling_processor.py | 376 +++++++++++++++++++++ 3 files changed, 518 insertions(+), 486 deletions(-) create mode 100755 .actor/docling_processor.py diff --git a/.actor/Dockerfile b/.actor/Dockerfile index 18462815..7c4615c1 100644 --- a/.actor/Dockerfile +++ b/.actor/Dockerfile @@ -62,6 +62,7 @@ ENV EASYOCR_MODULE_PATH=/tmp/easyocr-models COPY --chown=1000:1000 .actor/actor.sh .actor/actor.sh COPY --chown=1000:1000 .actor/actor.json .actor/actor.json COPY --chown=1000:1000 .actor/input_schema.json .actor/input_schema.json +COPY --chown=1000:1000 .actor/docling_processor.py .actor/docling_processor.py RUN chmod +x .actor/actor.sh # Copy the build files from builder diff --git a/.actor/actor.sh b/.actor/actor.sh index c667d31c..a3ea5616 100755 --- a/.actor/actor.sh +++ b/.actor/actor.sh @@ -8,25 +8,14 @@ upload_to_kvs() { local description="$4" # Find the Apify CLI command - local apify_cmd="" - for cmd in "apify" "actor" "/usr/local/bin/apify" "/usr/bin/apify" "/opt/apify/cli/bin/apify"; do - if command -v "$cmd" &> /dev/null; then - apify_cmd="$cmd" - break - fi - done + find_apify_cmd + local apify_cmd="$FOUND_APIFY_CMD" if [ -n "$apify_cmd" ]; then echo "Uploading 
$description to key-value store (key: $key_name)..." # Create a temporary home directory with write permissions - export TMPDIR="/tmp/apify-home-${RANDOM}" - mkdir -p "$TMPDIR" - - # Multiple strategies to disable version checking - export APIFY_DISABLE_VERSION_CHECK=1 - export NODE_OPTIONS="--no-warnings" - export HOME="$TMPDIR" # Override home directory to writable location + setup_temp_environment # Use the --no-update-notifier flag if available if $apify_cmd --help | grep -q "\--no-update-notifier"; then @@ -34,7 +23,7 @@ upload_to_kvs() { echo "Successfully uploaded $description to key-value store" local url="https://api.apify.com/v2/key-value-stores/${APIFY_DEFAULT_KEY_VALUE_STORE_ID}/records/$key_name" echo "$description available at: $url" - rm -rf "$TMPDIR" 2>/dev/null || true # Clean up temp dir + cleanup_temp_environment return 0 fi else @@ -43,13 +32,13 @@ upload_to_kvs() { echo "Successfully uploaded $description to key-value store" local url="https://api.apify.com/v2/key-value-stores/${APIFY_DEFAULT_KEY_VALUE_STORE_ID}/records/$key_name" echo "$description available at: $url" - rm -rf "$TMPDIR" 2>/dev/null || true # Clean up temp dir + cleanup_temp_environment return 0 fi fi echo "ERROR: Failed to upload $description to key-value store" - rm -rf "$TMPDIR" 2>/dev/null || true # Clean up temp dir + cleanup_temp_environment return 1 else echo "ERROR: Apify CLI not found for $description upload" @@ -57,6 +46,64 @@ upload_to_kvs() { fi } +# Function to find Apify CLI command +find_apify_cmd() { + FOUND_APIFY_CMD="" + for cmd in "apify" "actor" "/usr/local/bin/apify" "/usr/bin/apify" "/opt/apify/cli/bin/apify"; do + if command -v "$cmd" &> /dev/null; then + FOUND_APIFY_CMD="$cmd" + break + fi + done +} + +# Function to set up temporary environment for Apify CLI +setup_temp_environment() { + export TMPDIR="/tmp/apify-home-${RANDOM}" + mkdir -p "$TMPDIR" + export APIFY_DISABLE_VERSION_CHECK=1 + export NODE_OPTIONS="--no-warnings" + export HOME="$TMPDIR" # 
Override home directory to writable location +} + +# Function to clean up temporary environment +cleanup_temp_environment() { + rm -rf "$TMPDIR" 2>/dev/null || true +} + +# Function to push data to Apify dataset +push_to_dataset() { + local document_url="$1" + local result_url="$2" + + # Find Apify CLI command + find_apify_cmd + local apify_cmd="$FOUND_APIFY_CMD" + + if [ -n "$apify_cmd" ]; then + echo "Adding record to dataset..." + setup_temp_environment + + # Use the --no-update-notifier flag if available + if $apify_cmd --help | grep -q "\--no-update-notifier"; then + if $apify_cmd --no-update-notifier actor:push-data "{\"url\": \"${document_url}\", \"output_file\": \"${result_url}\", \"status\": \"success\"}"; then + echo "Successfully added record to dataset" + else + echo "Warning: Failed to add record to dataset" + fi + else + # Fall back to regular command + if $apify_cmd actor:push-data "{\"url\": \"${document_url}\", \"output_file\": \"${result_url}\", \"status\": \"success\"}"; then + echo "Successfully added record to dataset" + else + echo "Warning: Failed to add record to dataset" + fi + fi + + cleanup_temp_environment + fi +} + # --- Setup logging and error handling --- @@ -98,6 +145,17 @@ else echo "Warning: No build files directory found. Some tools may be unavailable." fi +# Copy Python processor script to tools directory +PYTHON_SCRIPT_PATH="$(dirname "$0")/docling_processor.py" +if [ -f "$PYTHON_SCRIPT_PATH" ]; then + echo "Copying Python processor script to tools directory..." + cp "$PYTHON_SCRIPT_PATH" "$TOOLS_DIR/" + chmod +x "$TOOLS_DIR/docling_processor.py" +else + echo "ERROR: Python processor script not found at $PYTHON_SCRIPT_PATH" + exit 1 +fi + # Check OCR directories and ensure they're writable echo "Checking OCR directory permissions..." 
OCR_DIR="/opt/app-root/src/.EasyOCR" @@ -108,6 +166,7 @@ if [ -d "$OCR_DIR" ]; then rm "$OCR_DIR/test_write" else echo "[✗] OCR directory is not writable, setting up alternative in /tmp" + # Create alternative in /tmp (which is writable) mkdir -p "/tmp/.EasyOCR/user_network" export EASYOCR_MODULE_PATH="/tmp/.EasyOCR" @@ -224,41 +283,52 @@ DOCLING_API_ENDPOINT="http://localhost:5001/v1alpha/convert/source" echo "Starting document processing..." echo "Reading input from Apify..." -INPUT="" +# Function to handle Actor input detection +get_actor_input() { + local input="" -# Create directory if it doesn't exist -mkdir -p "/tmp/actor-input" || echo "Warning: Could not create /tmp/actor-input directory" + # Create directory if it doesn't exist + mkdir -p "/tmp/actor-input" || echo "Warning: Could not create /tmp/actor-input directory" >&2 -# List all possible input locations for debugging -echo "Listing potential input file locations:" -ls -la "/tmp/actor-input/" 2>/dev/null || echo "Cannot list /tmp/actor-input/" -ls -la "/input/" 2>/dev/null || echo "Cannot list /input/" + # If /tmp/actor-input/INPUT exists as a directory, remove it + if [ -d "/tmp/actor-input/INPUT" ]; then + echo "Warning: /tmp/actor-input/INPUT exists as a directory. Removing it to create a file." 
>&2 + rm -rf "/tmp/actor-input/INPUT" + fi -# Check multiple potential locations for input file -if [ -f "/tmp/actor-input/INPUT" ]; then - echo "Found standard Actor input file at /tmp/actor-input/INPUT" - echo "Content:" - cat "/tmp/actor-input/INPUT" - INPUT=$(cat "/tmp/actor-input/INPUT") -elif [ -f "/input/INPUT" ]; then - echo "Found Actor input file at /input/INPUT" - echo "Content:" - cat "/input/INPUT" - INPUT=$(cat "/input/INPUT") -# Fallback to environment variable -elif [ -n "$APIFY_INPUT_JSON" ]; then - echo "Using APIFY_INPUT_JSON environment variable" - INPUT="$APIFY_INPUT_JSON" -# Last resort: use test input - now defaulting to md as requested -else - echo "No input found, using test input with md format" - TEST_INPUT='{"documentUrl":"https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf","ocr":true,"outputFormat":"md"}' - mkdir -p "/tmp/actor-input" - echo "$TEST_INPUT" > "/tmp/actor-input/INPUT" - INPUT="$TEST_INPUT" -fi + # Check multiple potential locations for input file + if [ -f "/tmp/actor-input/INPUT" ]; then + echo "Found standard Actor input file at /tmp/actor-input/INPUT" >&2 + input=$(cat "/tmp/actor-input/INPUT") + elif [ -f "/input/INPUT" ]; then + echo "Found Actor input file at /input/INPUT" >&2 + input=$(cat "/input/INPUT") -echo "Input content: $INPUT" + # Fallback to environment variable + elif [ -n "$APIFY_INPUT_JSON" ]; then + echo "Using APIFY_INPUT_JSON environment variable" >&2 + input="$APIFY_INPUT_JSON" + + # Last resort: use test input with md format + else + echo "No input found, using test input with md format" >&2 + TEST_INPUT='{"documentUrl":"https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf","ocr":true,"outputFormat":"md"}' + mkdir -p "/tmp/actor-input" + echo "$TEST_INPUT" > "/tmp/actor-input/INPUT" + + # Read back the test input to ensure we get clean JSON + input=$(cat "/tmp/actor-input/INPUT") + fi + + # Return only the JSON 
content + echo "$input" +} + +# Get actor input +INPUT=$(get_actor_input) +echo "Input content:" >&2 +echo "$INPUT" >&2 # Send the raw input to stderr for debugging +echo "$INPUT" # Send the clean JSON to stdout for processing # Extract values from INPUT using Python echo "Using Python to parse input..." @@ -271,13 +341,17 @@ if [ -z "$DOCUMENT_URL" ]; then echo "ERROR: No document URL provided in input" # Try to push data to Actor but don't exit if it fails - if command -v actor &> /dev/null; then + find_apify_cmd + apify_cmd="$FOUND_APIFY_CMD" + if [ -n "$apify_cmd" ]; then echo "Reporting missing document URL to Actor storage..." - if actor push-data "{\"status\": \"error\", \"error\": \"No document URL provided in input\"}" 2>&1; then + setup_temp_environment + if $apify_cmd actor:push-data "{\"status\": \"error\", \"error\": \"No document URL provided in input\"}" 2>&1; then echo "Successfully pushed error message to Actor storage" else echo "Warning: Failed to push error message to Actor storage" fi + cleanup_temp_environment fi # Use default document URL for testing instead of exiting @@ -290,410 +364,29 @@ if [ -z "$OUTPUT_FORMAT" ]; then OUTPUT_FORMAT="md" fi +# Ensure OCR_ENABLED has a valid boolean value +if [ -z "$OCR_ENABLED" ]; then + echo "No OCR setting specified, defaulting to true" + OCR_ENABLED="true" +fi + echo "Input values: documentUrl=$DOCUMENT_URL, outputFormat=$OUTPUT_FORMAT, ocr=$OCR_ENABLED" # Create the request JSON REQUEST_JSON="{\"options\":{\"to_formats\":[\"$OUTPUT_FORMAT\"],\"ocr\":$OCR_ENABLED},\"http_sources\":[{\"url\":\"$DOCUMENT_URL\"}]}" +echo "Creating request JSON:" >&2 +echo "$REQUEST_JSON" >&2 echo "$REQUEST_JSON" > "$API_DIR/request.json" -# Send the conversion request +# Send the conversion request using our Python script echo "Sending conversion request to docling-serve API..." 
-python -c " -import json -import time -import sys -import os -import traceback - -try: - # Load request data from temporary location - with open('$API_DIR/request.json', 'r') as f: - request_data = json.load(f) - - print(f'Request to convert URL: {request_data[\"http_sources\"][0][\"url\"]}') - print(f'Output format: {request_data[\"options\"][\"to_formats\"][0]}') - print(f'OCR enabled: {request_data[\"options\"][\"ocr\"]}') - - # Try requests first, fall back to urllib - try: - import requests - print('Using requests library for API call') - - # Record start time for timing - start_time = time.time() - print(f'Starting conversion request at {time.strftime(\"%H:%M:%S\")}') - - response = requests.post( - '$DOCLING_API_ENDPOINT', - json=request_data, - timeout=300 # 5 minutes timeout - ) - - elapsed = time.time() - start_time - print(f'Conversion request completed in {elapsed:.2f} seconds') - print(f'Response status code: {response.status_code}') - - # Save the full response for debugging - with open('$API_DIR/full_response.txt', 'w') as f: - f.write(f'Status code: {response.status_code}\\n') - f.write(f'Headers: {response.headers}\\n\\n') - f.write(f'Content: {response.text[:10000]}...' 
if len(response.text) > 10000 else f'Content: {response.text}') - - if response.status_code == 200: - with open('$API_DIR/response.json', 'w') as f: - f.write(response.text) - - # Parse the response even if it's not valid JSON - try: - resp_data = response.json() - print('Successfully parsed response as JSON') - - # Save detailed diagnostics about the response structure - with open('$API_DIR/response_structure.txt', 'w') as f: - f.write(f'Response keys: {list(resp_data.keys())}\\n') - if 'document' in resp_data: - f.write(f'Document keys: {list(resp_data[\"document\"].keys() if resp_data[\"document\"] else [])}\\n') - - # Check for specific content fields with null safety - doc = resp_data['document'] or {} - if 'html_content' in doc and doc['html_content']: - f.write(f'HTML content length: {len(doc[\"html_content\"])}\\n') - elif 'html_content' in doc: - f.write('HTML content is present but empty or null\\n') - - if 'md_content' in doc and doc['md_content']: - f.write(f'Markdown content length: {len(doc[\"md_content\"])}\\n') - elif 'md_content' in doc: - f.write('Markdown content is present but empty or null\\n') - - if 'text_content' in doc and doc['text_content']: - f.write(f'Text content length: {len(doc[\"text_content\"])}\\n') - elif 'text_content' in doc: - f.write('Text content is present but empty or null\\n') - - if 'json_content' in doc and doc['json_content']: - f.write(f'JSON content length: {len(doc[\"json_content\"])}\\n') - elif 'json_content' in doc: - f.write('JSON content is present but empty or null\\n') - - if 'outputs' in resp_data: - f.write(f'Outputs count: {len(resp_data[\"outputs\"])}\\n') - if resp_data['outputs']: - f.write(f'First output keys: {list(resp_data[\"outputs\"][0].keys())}\\n') - if 'files' in resp_data['outputs'][0]: - f.write(f'Files count: {len(resp_data[\"outputs\"][0][\"files\"])}\\n') - if resp_data['outputs'][0]['files']: - f.write(f'First file keys: {list(resp_data[\"outputs\"][0][\"files\"][0].keys())}\\n') - if 
'content' in resp_data['outputs'][0]['files'][0]: - content_length = len(resp_data['outputs'][0]['files'][0]['content']) - f.write(f'Content length: {content_length}\\n') - - # Process the response - check for outputs and files - if 'outputs' in resp_data and resp_data['outputs']: - output = resp_data['outputs'][0] - print(f'Found {len(resp_data[\"outputs\"])} outputs in response') - - if 'files' in output and output['files']: - file_data = output['files'][0] - print(f'Found {len(output[\"files\"])} files in output') - - if 'content' in file_data and file_data['content']: - print(f'Found content in file (length: {len(file_data[\"content\"])})') - with open('$API_DIR/output.$OUTPUT_FORMAT', 'w') as f: - f.write(file_data['content']) - print('CONVERSION SUCCESS') - sys.exit(0) - else: - if 'content' in file_data: - print('Content field exists but is empty') - else: - print('No content field in file data') - print(f'Available fields: {list(file_data.keys())}') - else: - print('No files found in output') - print(f'Available fields: {list(output.keys())}') - - # Alternative response format check - document field - elif 'document' in resp_data and resp_data['status'] == 'success': - print('Found alternative response format with document field') - document = resp_data['document'] or {} - - # Check format fields in document to see what's available - available_formats = [] - if 'html_content' in document and document['html_content']: - available_formats.append(('html', document['html_content'])) - if 'md_content' in document and document['md_content']: - available_formats.append(('md', document['md_content'])) - if 'text_content' in document and document['text_content']: - available_formats.append(('text', document['text_content'])) - if 'json_content' in document and document['json_content']: - available_formats.append(('json', document['json_content'])) - - if available_formats: - print(f'Found {len(available_formats)} available formats: {[f[0] for f in 
available_formats]}') - - # First try to find the exact requested format - requested_format_match = next((f for f in available_formats if f[0] == '$OUTPUT_FORMAT'.lower()), None) - - if requested_format_match: - format_type, content = requested_format_match - print(f'Found content in requested format {format_type} (length: {len(content)})') - else: - # If requested format not found, use the first available - format_type, content = available_formats[0] - print(f'Requested format not found, using alternative format {format_type} (length: {len(content)})') - - # Save the content to the output file with appropriate extension - with open(f'$API_DIR/output.{format_type}', 'w') as f: - f.write(content) - - # If we're using a different format than requested, also save with requested extension - if format_type != '$OUTPUT_FORMAT'.lower(): - print(f'Saving content with requested extension {format_type} -> $OUTPUT_FORMAT') - with open('$API_DIR/output.$OUTPUT_FORMAT', 'w') as f: - f.write(content) - - print('CONVERSION SUCCESS') - sys.exit(0) - else: - # No content fields found or all are empty - # Check if fields exist but are empty or null - empty_fields = [] - if 'html_content' in document and not document['html_content']: - empty_fields.append('html_content') - if 'md_content' in document and not document['md_content']: - empty_fields.append('md_content') - if 'text_content' in document and not document['text_content']: - empty_fields.append('text_content') - - if empty_fields: - print(f'Found content fields but they are empty or null: {empty_fields}') - else: - print('No content fields found in document') - - print(f'Available fields in document: {list(document.keys() if document else [])}') - else: - print('No outputs found in response') - print(f'Available fields: {list(resp_data.keys())}') - - # Try to extract any alternate formats or metadata - if 'metadata' in resp_data: - print('Metadata found in response, saving to file') - with open('$API_DIR/metadata.json', 'w') 
as f: - json.dump(resp_data['metadata'], f, indent=2) - - print('CONVERSION PARTIAL - Some data available but not complete') - except Exception as json_error: - print(f'Failed to parse response as JSON: {json_error}') - traceback.print_exc() - - # Save raw content as text if JSON parsing fails - with open('$API_DIR/output.txt', 'w') as f: - f.write(response.text) - print('Saved raw response as text file') - print('CONVERSION PARTIAL - Raw response saved') - else: - print(f'Error response: {response.text[:500]}') - print('CONVERSION FAILED') - - except ImportError: - # Fall back to urllib - import urllib.request - import urllib.error - - print('Using urllib library for API call') - headers = {'Content-Type': 'application/json'} - req_data = json.dumps(request_data).encode('utf-8') - - req = urllib.request.Request( - '$DOCLING_API_ENDPOINT', - data=req_data, - headers=headers, - method='POST' - ) - - try: - start_time = time.time() - print(f'Starting conversion request at {time.strftime(\"%H:%M:%S\")}') - - with urllib.request.urlopen(req, timeout=300) as response: - elapsed = time.time() - start_time - print(f'Conversion request completed in {elapsed:.2f} seconds') - print(f'Response status: {response.status}') - - if response.status == 200: - response_text = response.read().decode('utf-8') - - # Save full response for debugging - with open('$API_DIR/full_response.txt', 'w') as f: - f.write(f'Status: {response.status}\\n') - f.write(f'Headers: {response.headers}\\n\\n') - f.write(f'Content: {response_text[:10000]}...' 
if len(response_text) > 10000 else f'Content: {response_text}') - - with open('$API_DIR/response.json', 'w') as f: - f.write(response_text) - - try: - resp_data = json.loads(response_text) - print('Successfully parsed response as JSON') - - # Save detailed diagnostics about the response structure - with open('$API_DIR/response_structure.txt', 'w') as f: - f.write(f'Response keys: {list(resp_data.keys())}\\n') - if 'document' in resp_data: - f.write(f'Document keys: {list(resp_data[\"document\"].keys() if resp_data[\"document\"] else [])}\\n') - - # Check for specific content fields with null safety - doc = resp_data['document'] or {} - if 'html_content' in doc and doc['html_content']: - f.write(f'HTML content length: {len(doc[\"html_content\"])}\\n') - elif 'html_content' in doc: - f.write('HTML content is present but empty or null\\n') - - if 'md_content' in doc and doc['md_content']: - f.write(f'Markdown content length: {len(doc[\"md_content\"])}\\n') - elif 'md_content' in doc: - f.write('Markdown content is present but empty or null\\n') - - if 'text_content' in doc and doc['text_content']: - f.write(f'Text content length: {len(doc[\"text_content\"])}\\n') - elif 'text_content' in doc: - f.write('Text content is present but empty or null\\n') - - if 'json_content' in doc and doc['json_content']: - f.write(f'JSON content length: {len(doc[\"json_content\"])}\\n') - elif 'json_content' in doc: - f.write('JSON content is present but empty or null\\n') - - if 'outputs' in resp_data: - f.write(f'Outputs count: {len(resp_data[\"outputs\"])}\\n') - if resp_data['outputs']: - f.write(f'First output keys: {list(resp_data[\"outputs\"][0].keys())}\\n') - if 'files' in resp_data['outputs'][0]: - f.write(f'Files count: {len(resp_data[\"outputs\"][0][\"files\"])}\\n') - if resp_data['outputs'][0]['files']: - f.write(f'First file keys: {list(resp_data[\"outputs\"][0][\"files\"][0].keys())}\\n') - if 'content' in resp_data['outputs'][0]['files'][0]: - content_length = 
len(resp_data['outputs'][0]['files'][0]['content']) - f.write(f'Content length: {content_length}\\n') - - if 'outputs' in resp_data and resp_data['outputs']: - output = resp_data['outputs'][0] - print(f'Found {len(resp_data[\"outputs\"])} outputs in response') - - if 'files' in output and output['files']: - file_data = output['files'][0] - print(f'Found {len(output[\"files\"])} files in output') - - if 'content' in file_data and file_data['content']: - print(f'Found content in file (length: {len(file_data[\"content\"])})') - with open('$API_DIR/output.$OUTPUT_FORMAT', 'w') as f: - f.write(file_data['content']) - print('CONVERSION SUCCESS') - sys.exit(0) - else: - if 'content' in file_data: - print('Content field exists but is empty') - else: - print('No content field in file data') - print(f'Available fields: {list(file_data.keys())}') - else: - print('No files found in output') - print(f'Available fields: {list(output.keys())}') - - # Alternative response format check - document field - elif 'document' in resp_data and resp_data['status'] == 'success': - print('Found alternative response format with document field') - document = resp_data['document'] or {} - - # Check format fields in document to see what's available - available_formats = [] - if 'html_content' in document and document['html_content']: - available_formats.append(('html', document['html_content'])) - if 'md_content' in document and document['md_content']: - available_formats.append(('md', document['md_content'])) - if 'text_content' in document and document['text_content']: - available_formats.append(('text', document['text_content'])) - if 'json_content' in document and document['json_content']: - available_formats.append(('json', document['json_content'])) - - if available_formats: - print(f'Found {len(available_formats)} available formats: {[f[0] for f in available_formats]}') - - # First try to find the exact requested format - requested_format_match = next((f for f in available_formats if f[0] 
== '$OUTPUT_FORMAT'.lower()), None) - - if requested_format_match: - format_type, content = requested_format_match - print(f'Found content in requested format {format_type} (length: {len(content)})') - else: - # If requested format not found, use the first available - format_type, content = available_formats[0] - print(f'Requested format not found, using alternative format {format_type} (length: {len(content)})') - - # Save the content to the output file with appropriate extension - with open(f'$API_DIR/output.{format_type}', 'w') as f: - f.write(content) - - # If we're using a different format than requested, also save with requested extension - if format_type != '$OUTPUT_FORMAT'.lower(): - print(f'Saving content with requested extension {format_type} -> $OUTPUT_FORMAT') - with open('$API_DIR/output.$OUTPUT_FORMAT', 'w') as f: - f.write(content) - - print('CONVERSION SUCCESS') - sys.exit(0) - else: - # No content fields found or all are empty - # Check if fields exist but are empty or null - empty_fields = [] - if 'html_content' in document and not document['html_content']: - empty_fields.append('html_content') - if 'md_content' in document and not document['md_content']: - empty_fields.append('md_content') - if 'text_content' in document and not document['text_content']: - empty_fields.append('text_content') - - if empty_fields: - print(f'Found content fields but they are empty or null: {empty_fields}') - else: - print('No content fields found in document') - - print(f'Available fields in document: {list(document.keys() if document else [])}') - else: - print('No outputs found in response') - print(f'Available fields: {list(resp_data.keys())}') - - print('CONVERSION PARTIAL - Some data available but not complete') - except Exception as json_error: - print(f'Failed to parse response as JSON: {json_error}') - traceback.print_exc() - - # Save raw content as text if JSON parsing fails - with open('$API_DIR/output.txt', 'w') as f: - f.write(response_text) - 
print('Saved raw response as text file') - print('CONVERSION PARTIAL - Raw response saved') - else: - print(f'Error status: {response.status}') - print('CONVERSION FAILED') - except urllib.error.HTTPError as e: - print(f'HTTP Error: {e.code} - {e.reason}') - print(f'Response body: {e.read().decode(\"utf-8\")[:500]}') - print('CONVERSION FAILED') - except urllib.error.URLError as e: - print(f'URL Error: {e.reason}') - print('CONVERSION FAILED') - except Exception as e: - print(f'Unexpected error during urllib request: {e}') - traceback.print_exc() - print('CONVERSION FAILED') -except Exception as e: - print(f'Error during conversion: {e}') - traceback.print_exc() - print('CONVERSION FAILED') -" 2>&1 +python "$TOOLS_DIR/docling_processor.py" \ + --api-endpoint "$DOCLING_API_ENDPOINT" \ + --request-json "$API_DIR/request.json" \ + --output-dir "$API_DIR" \ + --output-format "$OUTPUT_FORMAT" +PYTHON_EXIT_CODE=$? # --- Check for various potential output files --- @@ -732,46 +425,8 @@ if [ -f "$API_DIR/output.$OUTPUT_FORMAT" ]; then echo "Your document is available at: ${RESULT_URL}" echo "==============================" - # Find the Apify CLI again (reusing the function's logic would be better, but for clarity we'll repeat) - APIFY_CMD="" - for cmd in "apify" "actor" "/usr/local/bin/apify" "/usr/bin/apify" "/opt/apify/cli/bin/apify"; do - if command -v "$cmd" &> /dev/null; then - APIFY_CMD="$cmd" - break - fi - done - - if [ -n "$APIFY_CMD" ]; then - # Add record to dataset with enhanced version check prevention - echo "Adding record to dataset..." 
- - # Create a temporary home directory with write permissions - export TMPDIR="/tmp/apify-home-${RANDOM}" - mkdir -p "$TMPDIR" - - # Multiple strategies to disable version checking - export APIFY_DISABLE_VERSION_CHECK=1 - export NODE_OPTIONS="--no-warnings" - export HOME="$TMPDIR" # Override home directory to writable location - - # Use the --no-update-notifier flag if available - if $APIFY_CMD --help | grep -q "\--no-update-notifier"; then - if $APIFY_CMD --no-update-notifier actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"output_file\": \"${RESULT_URL}\", \"status\": \"success\"}"; then - echo "Successfully added record to dataset" - else - echo "Warning: Failed to add record to dataset" - fi - else - # Fall back to regular command - if $APIFY_CMD actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"output_file\": \"${RESULT_URL}\", \"status\": \"success\"}"; then - echo "Successfully added record to dataset" - else - echo "Warning: Failed to add record to dataset" - fi - fi - - rm -rf "$TMPDIR" 2>/dev/null || true # Clean up temp dir - fi + # Push data to dataset + push_to_dataset "$DOCUMENT_URL" "$RESULT_URL" fi else echo "ERROR: No converted output file found at $API_DIR/output.$OUTPUT_FORMAT" diff --git a/.actor/docling_processor.py b/.actor/docling_processor.py new file mode 100755 index 00000000..12f10712 --- /dev/null +++ b/.actor/docling_processor.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 +""" +Document Processing Script for Docling-Serve API + +This script handles the communication with the docling-serve API, +processes the conversion request, and saves the output to the specified location. 
+""" + +import argparse +import json +import os +import sys +import time +import traceback +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union + +# Global constants +DEFAULT_TIMEOUT = 300 # 5 minutes +OUTPUT_FORMATS = ["md", "html", "json", "text"] + + +def setup_arg_parser() -> argparse.ArgumentParser: + """Set up command line argument parser.""" + parser = argparse.ArgumentParser(description="Process documents using docling-serve API") + parser.add_argument("--api-endpoint", required=True, help="Docling API endpoint URL") + parser.add_argument("--request-json", required=True, help="Path to JSON file with request data") + parser.add_argument("--output-dir", required=True, help="Directory to save output files") + parser.add_argument("--output-format", required=True, help="Desired output format") + parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, help="Request timeout in seconds") + return parser + + +def load_request_data(json_path: str) -> Dict: + """Load request data from JSON file.""" + try: + with open(json_path, 'r') as f: + return json.load(f) + except (json.JSONDecodeError, FileNotFoundError) as e: + print(f"Error loading request data: {e}") + sys.exit(1) + + +def save_response_diagnostics(response_text: str, status_code: int, headers: Dict, output_dir: str) -> None: + """Save full response and headers for debugging.""" + with open(os.path.join(output_dir, "full_response.txt"), 'w') as f: + f.write(f"Status code: {status_code}\n") + f.write(f"Headers: {headers}\n\n") + + # Truncate very long responses + if len(response_text) > 10000: + f.write(f"Content: {response_text[:10000]}...") + else: + f.write(f"Content: {response_text}") + + +def save_response_json(response_text: str, output_dir: str) -> None: + """Save raw response JSON.""" + with open(os.path.join(output_dir, "response.json"), 'w') as f: + f.write(response_text) + + +def save_structure_info(data: Dict, output_dir: str) -> None: + """Save detailed 
def save_structure_info(data: Dict, output_dir: str) -> None:
    """Save diagnostic information about response structure.

    Writes a human-readable summary of the response's top-level keys, the
    `document` content fields, and the `outputs`/`files` layout to
    ``response_structure.txt`` in *output_dir* so failed or partial
    conversions can be debugged offline.

    NOTE(review): the original ``def`` line fell outside the visible hunk;
    the signature is reconstructed from the call site
    ``save_structure_info(data, output_dir)`` in process_success_response.
    """
    with open(os.path.join(output_dir, "response_structure.txt"), 'w', encoding='utf-8') as f:
        f.write(f'Response keys: {list(data.keys())}\n')

        # Document content details
        if 'document' in data:
            doc = data['document'] or {}  # guard: 'document' may be null
            f.write(f'Document keys: {list(doc.keys() if doc else [])}\n')

            # Report length (or emptiness) of each known content field
            for content_type in ['html_content', 'md_content', 'text_content', 'json_content']:
                if content_type in doc and doc[content_type]:
                    f.write(f'{content_type.replace("_content", "").upper()} content length: {len(doc[content_type])}\n')
                elif content_type in doc:
                    f.write(f'{content_type.replace("_content", "").upper()} content is present but empty or null\n')

        # Output structure details (the 'files' response shape)
        if 'outputs' in data:
            f.write(f'Outputs count: {len(data["outputs"])}\n')
            if data['outputs']:
                output = data['outputs'][0]
                f.write(f'First output keys: {list(output.keys())}\n')

                if 'files' in output:
                    f.write(f'Files count: {len(output["files"])}\n')
                    if output['files']:
                        file_data = output['files'][0]
                        f.write(f'First file keys: {list(file_data.keys())}\n')
                        if 'content' in file_data:
                            content_length = len(file_data['content'])
                            f.write(f'Content length: {content_length}\n')


def extract_content_from_file_output(data: Dict, output_format: str, output_dir: str) -> bool:
    """Extract content from 'files' output format.

    Looks for ``data['outputs'][0]['files'][0]['content']`` and, when
    present and non-empty, saves it as ``output.<output_format>`` in
    *output_dir*.

    Returns:
        True when content was found and saved, False otherwise.
    """
    if 'outputs' not in data or not data['outputs']:
        print('No outputs found in response')
        return False

    output = data['outputs'][0]
    if 'files' not in output or not output['files']:
        print('No files found in output')
        print(f'Available fields: {list(output.keys())}')
        return False

    file_data = output['files'][0]
    if 'content' not in file_data or not file_data['content']:
        # Distinguish "field present but empty" from "field missing"
        if 'content' in file_data:
            print('Content field exists but is empty')
        else:
            print('No content field in file data')
        print(f'Available fields: {list(file_data.keys())}')
        return False

    # Content found, save it
    content = file_data['content']
    print(f'Found content in file (length: {len(content)})')
    with open(os.path.join(output_dir, f"output.{output_format}"), 'w', encoding='utf-8') as f:
        f.write(content)
    print('CONVERSION SUCCESS')
    return True


def extract_content_from_document(data: Dict, output_format: str, output_dir: str) -> bool:
    """Extract content from 'document' response format.

    Expects ``data['status'] == 'success'`` and a ``document`` mapping with
    one or more of ``html_content``/``md_content``/``text_content``/
    ``json_content``. Prefers the requested *output_format*; otherwise falls
    back to the first available format, saving the content under both the
    matched and the requested extension.

    Returns:
        True when some content was saved, False otherwise.
    """
    if 'document' not in data or data.get('status') != 'success':
        print('No document field or success status found in response')
        return False

    document = data['document'] or {}  # guard: 'document' may be null

    # Collect every non-empty content field as (format, content) pairs
    available_formats = []
    for fmt in ['html', 'md', 'text', 'json']:
        content_field = f'{fmt}_content'
        if content_field in document and document[content_field]:
            available_formats.append((fmt, document[content_field]))

    if not available_formats:
        # Nothing usable — report whether the fields existed but were empty
        empty_fields = []
        for fmt in ['html', 'md', 'text', 'json']:
            content_field = f'{fmt}_content'
            if content_field in document and not document[content_field]:
                empty_fields.append(content_field)

        if empty_fields:
            print(f'Found content fields but they are empty or null: {empty_fields}')
        else:
            print('No content fields found in document')

        print(f'Available fields in document: {list(document.keys() if document else [])}')
        return False

    print(f'Found {len(available_formats)} available formats: {[f[0] for f in available_formats]}')

    # Prefer an exact match on the requested format
    requested_format_match = next((f for f in available_formats if f[0] == output_format.lower()), None)

    if requested_format_match:
        format_type, content = requested_format_match
        print(f'Found content in requested format {format_type} (length: {len(content)})')
    else:
        # Requested format not available — use the first one we have
        format_type, content = available_formats[0]
        print(f'Requested format not found, using alternative format {format_type} (length: {len(content)})')

    # Save with the matched format's extension
    with open(os.path.join(output_dir, f"output.{format_type}"), 'w', encoding='utf-8') as f:
        f.write(content)

    # If we're using a different format than requested, also save with requested extension
    if format_type != output_format.lower():
        print(f'Saving content with requested extension {format_type} -> {output_format}')
        with open(os.path.join(output_dir, f"output.{output_format}"), 'w', encoding='utf-8') as f:
            f.write(content)

    print('CONVERSION SUCCESS')
    return True


def process_success_response(response_text: str, output_format: str, output_dir: str) -> bool:
    """Process a successful (HTTP 200) response and extract document content.

    Saves the raw response and a structure summary, then tries the two known
    response shapes in turn ('files' output, then 'document'). On any
    failure, degrades to saving the raw response as ``output.txt``.

    Returns:
        True on full success; False on partial or failed extraction.
    """
    try:
        # Save raw response for offline inspection
        save_response_json(response_text, output_dir)

        data = json.loads(response_text)
        print('Successfully parsed response as JSON')

        # Save detailed structure info
        save_structure_info(data, output_dir)

        # Try both response formats
        if extract_content_from_file_output(data, output_format, output_dir):
            return True

        if extract_content_from_document(data, output_format, output_dir):
            return True

        # Neither shape yielded content — keep metadata if present
        if 'metadata' in data:
            print('Metadata found in response, saving to file')
            with open(os.path.join(output_dir, "metadata.json"), 'w', encoding='utf-8') as f:
                json.dump(data['metadata'], f, indent=2)

        print('CONVERSION PARTIAL - Some data available but not complete')
        return False

    except Exception as json_error:
        # Broad by design: parse or extraction failures all degrade to
        # saving the raw text so no data is lost.
        print(f'Failed to parse response as JSON: {json_error}')
        traceback.print_exc()

        with open(os.path.join(output_dir, "output.txt"), 'w', encoding='utf-8') as f:
            f.write(response_text)
        print('Saved raw response as text file')
        print('CONVERSION PARTIAL - Raw response saved')
        return False


def process_requests_api(api_endpoint: str, request_data: Dict, output_format: str, output_dir: str, timeout: int) -> bool:
    """Process using requests library.

    Raises:
        ImportError: when the requests package is not installed, so that
            process_document() can fall back to the urllib implementation.
    """
    # BUGFIX: import OUTSIDE the try block. Previously `import requests`
    # sat inside the `try`, and the broad `except Exception` swallowed the
    # ImportError and returned False — making the `except ImportError`
    # urllib fallback in process_document() unreachable dead code.
    import requests

    try:
        print('Using requests library for API call')

        # Record start time for timing
        start_time = time.time()
        print(f'Starting conversion request at {time.strftime("%H:%M:%S")}')

        response = requests.post(
            api_endpoint,
            json=request_data,
            timeout=timeout
        )

        elapsed = time.time() - start_time
        print(f'Conversion request completed in {elapsed:.2f} seconds')
        print(f'Response status code: {response.status_code}')

        # Save response diagnostics
        save_response_diagnostics(response.text, response.status_code, dict(response.headers), output_dir)

        if response.status_code == 200:
            return process_success_response(response.text, output_format, output_dir)

        print(f'Error response: {response.text[:500]}')
        print('CONVERSION FAILED')
        return False

    except Exception as e:
        print(f'Error during requests API call: {e}')
        traceback.print_exc()
        print('CONVERSION FAILED')
        return False


def process_urllib_api(api_endpoint: str, request_data: Dict, output_format: str, output_dir: str, timeout: int) -> bool:
    """Process using urllib as fallback.

    Stdlib-only POST used when the requests package is unavailable.

    Returns:
        True on successful conversion, False on any error.
    """
    try:
        import urllib.request
        import urllib.error

        print('Using urllib library for API call')
        headers = {'Content-Type': 'application/json'}
        req_data = json.dumps(request_data).encode('utf-8')

        req = urllib.request.Request(
            api_endpoint,
            data=req_data,
            headers=headers,
            method='POST'
        )

        try:
            start_time = time.time()
            print(f'Starting conversion request at {time.strftime("%H:%M:%S")}')

            with urllib.request.urlopen(req, timeout=timeout) as response:
                elapsed = time.time() - start_time
                print(f'Conversion request completed in {elapsed:.2f} seconds')
                print(f'Response status: {response.status}')

                response_text = response.read().decode('utf-8')
                save_response_diagnostics(response_text, response.status, dict(response.headers), output_dir)

                if response.status == 200:
                    return process_success_response(response_text, output_format, output_dir)

                print(f'Error status: {response.status}')
                print('CONVERSION FAILED')
                return False

        except urllib.error.HTTPError as e:
            print(f'HTTP Error: {e.code} - {e.reason}')
            print(f'Response body: {e.read().decode("utf-8")[:500]}')
            print('CONVERSION FAILED')
            return False

        except urllib.error.URLError as e:
            print(f'URL Error: {e.reason}')
            print('CONVERSION FAILED')
            return False

        except Exception as e:
            print(f'Unexpected error during urllib request: {e}')
            traceback.print_exc()
            print('CONVERSION FAILED')
            return False

    except Exception as e:
        print(f'Error setting up urllib: {e}')
        traceback.print_exc()
        print('CONVERSION FAILED')
        return False


def process_document(api_endpoint: str, request_json_path: str, output_format: str,
                     output_dir: str, timeout: int) -> bool:
    """Main function to process a document through the docling-serve API.

    Loads the request payload from *request_json_path*, logs a summary,
    POSTs it to *api_endpoint* (requests first, urllib as fallback when
    requests is not installed) and writes results under *output_dir*.

    Returns:
        True on successful conversion, False otherwise.
    """
    try:
        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)

        # Load request data
        request_data = load_request_data(request_json_path)

        # Log request info
        if 'http_sources' in request_data and request_data['http_sources']:
            print(f'Request to convert URL: {request_data["http_sources"][0]["url"]}')

        if 'options' in request_data:
            options = request_data['options']
            if 'to_formats' in options and options['to_formats']:
                print(f'Output format: {options["to_formats"][0]}')
            if 'ocr' in options:
                print(f'OCR enabled: {options["ocr"]}')

        # Try requests first, fall back to urllib (reachable now that
        # process_requests_api lets ImportError propagate)
        try:
            return process_requests_api(api_endpoint, request_data, output_format, output_dir, timeout)
        except ImportError:
            return process_urllib_api(api_endpoint, request_data, output_format, output_dir, timeout)

    except Exception as e:
        print(f'Error during conversion: {e}')
        traceback.print_exc()
        print('CONVERSION FAILED')
        return False


def main():
    """Main entry point: parse CLI args, run the conversion, set exit code."""
    parser = setup_arg_parser()
    args = parser.parse_args()

    success = process_document(
        api_endpoint=args.api_endpoint,
        request_json_path=args.request_json,
        output_format=args.output_format,
        output_dir=args.output_dir,
        timeout=args.timeout
    )

    # Exit with appropriate code
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()