Actor: Refactor actor.sh and add docling_processor.py

Refactor the `actor.sh` script to modularize functions for finding the Apify CLI, setting up a temporary environment, and cleaning it up. Introduce a new function, `get_actor_input()`, to handle input detection more robustly. Replace inline Python conversion logic with an external script, `docling_processor.py`, for processing documents via the docling-serve API.

Signed-off-by: Václav Vančura <commit@vancura.dev>
This commit is contained in:
Václav Vančura 2025-03-09 15:51:39 +01:00 committed by Adam Kliment
parent 7a5dc3c438
commit 5f5c0a9d50
3 changed files with 518 additions and 486 deletions

View File

@ -62,6 +62,7 @@ ENV EASYOCR_MODULE_PATH=/tmp/easyocr-models
COPY --chown=1000:1000 .actor/actor.sh .actor/actor.sh
COPY --chown=1000:1000 .actor/actor.json .actor/actor.json
COPY --chown=1000:1000 .actor/input_schema.json .actor/input_schema.json
COPY --chown=1000:1000 .actor/docling_processor.py .actor/docling_processor.py
RUN chmod +x .actor/actor.sh
# Copy the build files from builder

View File

@ -8,25 +8,14 @@ upload_to_kvs() {
local description="$4"
# Find the Apify CLI command
local apify_cmd=""
for cmd in "apify" "actor" "/usr/local/bin/apify" "/usr/bin/apify" "/opt/apify/cli/bin/apify"; do
if command -v "$cmd" &> /dev/null; then
apify_cmd="$cmd"
break
fi
done
find_apify_cmd
local apify_cmd="$FOUND_APIFY_CMD"
if [ -n "$apify_cmd" ]; then
echo "Uploading $description to key-value store (key: $key_name)..."
# Create a temporary home directory with write permissions
export TMPDIR="/tmp/apify-home-${RANDOM}"
mkdir -p "$TMPDIR"
# Multiple strategies to disable version checking
export APIFY_DISABLE_VERSION_CHECK=1
export NODE_OPTIONS="--no-warnings"
export HOME="$TMPDIR" # Override home directory to writable location
setup_temp_environment
# Use the --no-update-notifier flag if available
if $apify_cmd --help | grep -q "\--no-update-notifier"; then
@ -34,7 +23,7 @@ upload_to_kvs() {
echo "Successfully uploaded $description to key-value store"
local url="https://api.apify.com/v2/key-value-stores/${APIFY_DEFAULT_KEY_VALUE_STORE_ID}/records/$key_name"
echo "$description available at: $url"
rm -rf "$TMPDIR" 2>/dev/null || true # Clean up temp dir
cleanup_temp_environment
return 0
fi
else
@ -43,13 +32,13 @@ upload_to_kvs() {
echo "Successfully uploaded $description to key-value store"
local url="https://api.apify.com/v2/key-value-stores/${APIFY_DEFAULT_KEY_VALUE_STORE_ID}/records/$key_name"
echo "$description available at: $url"
rm -rf "$TMPDIR" 2>/dev/null || true # Clean up temp dir
cleanup_temp_environment
return 0
fi
fi
echo "ERROR: Failed to upload $description to key-value store"
rm -rf "$TMPDIR" 2>/dev/null || true # Clean up temp dir
cleanup_temp_environment
return 1
else
echo "ERROR: Apify CLI not found for $description upload"
@ -57,6 +46,64 @@ upload_to_kvs() {
fi
}
# Locate the Apify CLI executable among the known candidate names/paths.
# Publishes the first match in the global FOUND_APIFY_CMD; leaves it empty
# when no candidate is executable.
find_apify_cmd() {
  FOUND_APIFY_CMD=""
  local candidate
  for candidate in apify actor /usr/local/bin/apify /usr/bin/apify /opt/apify/cli/bin/apify; do
    if command -v "$candidate" > /dev/null 2>&1; then
      FOUND_APIFY_CMD="$candidate"
      break
    fi
  done
}
# Prepare an isolated, writable HOME for the Apify CLI and silence its
# update/version checks.
#
# Exports:
#   TMPDIR - freshly created private temp directory
#   HOME   - redirected to TMPDIR so the CLI can write its config
#   APIFY_DISABLE_VERSION_CHECK, NODE_OPTIONS - suppress CLI noise
setup_temp_environment() {
  # Prefer mktemp: unpredictable names avoid /tmp symlink races.
  # Fall back to the old ${RANDOM}-based scheme if mktemp is unavailable.
  TMPDIR=$(mktemp -d "/tmp/apify-home-XXXXXX" 2>/dev/null) || {
    TMPDIR="/tmp/apify-home-${RANDOM}"
    mkdir -p "$TMPDIR"
  }
  export TMPDIR
  export APIFY_DISABLE_VERSION_CHECK=1
  export NODE_OPTIONS="--no-warnings"
  export HOME="$TMPDIR" # Override home directory to writable location
}
# Remove the temporary HOME created by setup_temp_environment.
# Guards against an unset/empty TMPDIR so we never run a bare `rm -rf`
# (and never trip `set -u`); safe to call repeatedly.
cleanup_temp_environment() {
  if [ -n "${TMPDIR:-}" ]; then
    rm -rf -- "$TMPDIR" 2>/dev/null || true
  fi
}
# Push a success record (document URL + result URL) to the default Apify
# dataset.  Silently does nothing when the Apify CLI cannot be found.
#
# Arguments:
#   $1 - source document URL
#   $2 - URL of the converted output in the key-value store
push_to_dataset() {
    local document_url="$1"
    local result_url="$2"

    # Find Apify CLI command
    find_apify_cmd
    local apify_cmd="$FOUND_APIFY_CMD"

    if [ -n "$apify_cmd" ]; then
        echo "Adding record to dataset..."
        setup_temp_environment

        # Add --no-update-notifier only when this CLI version supports it;
        # the push itself is identical either way (previously duplicated).
        local -a flags=()
        if $apify_cmd --help | grep -q "\--no-update-notifier"; then
            flags=(--no-update-notifier)
        fi

        # NOTE(review): JSON is built by string interpolation; URLs containing
        # double quotes would break it — assumed safe for Apify-provided URLs.
        if $apify_cmd "${flags[@]}" actor:push-data "{\"url\": \"${document_url}\", \"output_file\": \"${result_url}\", \"status\": \"success\"}"; then
            echo "Successfully added record to dataset"
        else
            echo "Warning: Failed to add record to dataset"
        fi

        cleanup_temp_environment
    fi
}
# --- Setup logging and error handling ---
@ -98,6 +145,17 @@ else
echo "Warning: No build files directory found. Some tools may be unavailable."
fi
# Copy Python processor script to tools directory
PYTHON_SCRIPT_PATH="$(dirname "$0")/docling_processor.py"
if [ -f "$PYTHON_SCRIPT_PATH" ]; then
echo "Copying Python processor script to tools directory..."
cp "$PYTHON_SCRIPT_PATH" "$TOOLS_DIR/"
chmod +x "$TOOLS_DIR/docling_processor.py"
else
echo "ERROR: Python processor script not found at $PYTHON_SCRIPT_PATH"
exit 1
fi
# Check OCR directories and ensure they're writable
echo "Checking OCR directory permissions..."
OCR_DIR="/opt/app-root/src/.EasyOCR"
@ -108,6 +166,7 @@ if [ -d "$OCR_DIR" ]; then
rm "$OCR_DIR/test_write"
else
echo "[✗] OCR directory is not writable, setting up alternative in /tmp"
# Create alternative in /tmp (which is writable)
mkdir -p "/tmp/.EasyOCR/user_network"
export EASYOCR_MODULE_PATH="/tmp/.EasyOCR"
@ -224,41 +283,52 @@ DOCLING_API_ENDPOINT="http://localhost:5001/v1alpha/convert/source"
echo "Starting document processing..."
echo "Reading input from Apify..."
INPUT=""
# Detect and return the Actor input JSON on stdout.
# All diagnostics go to stderr so callers can capture clean JSON via $( );
# previously several echo/ls/cat diagnostics leaked onto stdout and
# corrupted the captured input.
#
# Search order:
#   1. /tmp/actor-input/INPUT   (standard Actor input file)
#   2. /input/INPUT             (alternate mount point)
#   3. $APIFY_INPUT_JSON        (environment fallback)
#   4. built-in test input      (md format, sample PDF)
get_actor_input() {
    local input=""

    # Create directory if it doesn't exist
    mkdir -p "/tmp/actor-input" || echo "Warning: Could not create /tmp/actor-input directory" >&2

    # List all possible input locations for debugging (stderr only)
    echo "Listing potential input file locations:" >&2
    ls -la "/tmp/actor-input/" >&2 2>/dev/null || echo "Cannot list /tmp/actor-input/" >&2
    ls -la "/input/" >&2 2>/dev/null || echo "Cannot list /input/" >&2

    # If /tmp/actor-input/INPUT exists as a directory, remove it
    if [ -d "/tmp/actor-input/INPUT" ]; then
        echo "Warning: /tmp/actor-input/INPUT exists as a directory. Removing it to create a file." >&2
        rm -rf "/tmp/actor-input/INPUT"
    fi

    # Check multiple potential locations for input file
    if [ -f "/tmp/actor-input/INPUT" ]; then
        echo "Found standard Actor input file at /tmp/actor-input/INPUT" >&2
        input=$(cat "/tmp/actor-input/INPUT")
    elif [ -f "/input/INPUT" ]; then
        echo "Found Actor input file at /input/INPUT" >&2
        input=$(cat "/input/INPUT")
    # Fallback to environment variable
    elif [ -n "${APIFY_INPUT_JSON:-}" ]; then
        echo "Using APIFY_INPUT_JSON environment variable" >&2
        input="$APIFY_INPUT_JSON"
    # Last resort: use test input with md format
    else
        echo "No input found, using test input with md format" >&2
        TEST_INPUT='{"documentUrl":"https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf","ocr":true,"outputFormat":"md"}'
        mkdir -p "/tmp/actor-input"
        echo "$TEST_INPUT" > "/tmp/actor-input/INPUT"

        # Read back the test input to ensure we get clean JSON
        input=$(cat "/tmp/actor-input/INPUT")
    fi

    # Return only the JSON content
    echo "$input"
}
# Get actor input
INPUT=$(get_actor_input)
echo "Input content:" >&2
echo "$INPUT" >&2 # Send the raw input to stderr for debugging
echo "$INPUT" # Send the clean JSON to stdout for processing
# Extract values from INPUT using Python
echo "Using Python to parse input..."
@ -271,13 +341,17 @@ if [ -z "$DOCUMENT_URL" ]; then
echo "ERROR: No document URL provided in input"
# Try to push data to Actor but don't exit if it fails
if command -v actor &> /dev/null; then
find_apify_cmd
apify_cmd="$FOUND_APIFY_CMD"
if [ -n "$apify_cmd" ]; then
echo "Reporting missing document URL to Actor storage..."
if actor push-data "{\"status\": \"error\", \"error\": \"No document URL provided in input\"}" 2>&1; then
setup_temp_environment
if $apify_cmd actor:push-data "{\"status\": \"error\", \"error\": \"No document URL provided in input\"}" 2>&1; then
echo "Successfully pushed error message to Actor storage"
else
echo "Warning: Failed to push error message to Actor storage"
fi
cleanup_temp_environment
fi
# Use default document URL for testing instead of exiting
@ -290,410 +364,29 @@ if [ -z "$OUTPUT_FORMAT" ]; then
OUTPUT_FORMAT="md"
fi
# Ensure OCR_ENABLED has a valid boolean value
if [ -z "$OCR_ENABLED" ]; then
echo "No OCR setting specified, defaulting to true"
OCR_ENABLED="true"
fi
echo "Input values: documentUrl=$DOCUMENT_URL, outputFormat=$OUTPUT_FORMAT, ocr=$OCR_ENABLED"
# Create the request JSON
REQUEST_JSON="{\"options\":{\"to_formats\":[\"$OUTPUT_FORMAT\"],\"ocr\":$OCR_ENABLED},\"http_sources\":[{\"url\":\"$DOCUMENT_URL\"}]}"
echo "Creating request JSON:" >&2
echo "$REQUEST_JSON" >&2
echo "$REQUEST_JSON" > "$API_DIR/request.json"
# Send the conversion request
# Send the conversion request using our Python script
echo "Sending conversion request to docling-serve API..."
python -c "
import json
import time
import sys
import os
import traceback
try:
# Load request data from temporary location
with open('$API_DIR/request.json', 'r') as f:
request_data = json.load(f)
print(f'Request to convert URL: {request_data[\"http_sources\"][0][\"url\"]}')
print(f'Output format: {request_data[\"options\"][\"to_formats\"][0]}')
print(f'OCR enabled: {request_data[\"options\"][\"ocr\"]}')
# Try requests first, fall back to urllib
try:
import requests
print('Using requests library for API call')
# Record start time for timing
start_time = time.time()
print(f'Starting conversion request at {time.strftime(\"%H:%M:%S\")}')
response = requests.post(
'$DOCLING_API_ENDPOINT',
json=request_data,
timeout=300 # 5 minutes timeout
)
elapsed = time.time() - start_time
print(f'Conversion request completed in {elapsed:.2f} seconds')
print(f'Response status code: {response.status_code}')
# Save the full response for debugging
with open('$API_DIR/full_response.txt', 'w') as f:
f.write(f'Status code: {response.status_code}\\n')
f.write(f'Headers: {response.headers}\\n\\n')
f.write(f'Content: {response.text[:10000]}...' if len(response.text) > 10000 else f'Content: {response.text}')
if response.status_code == 200:
with open('$API_DIR/response.json', 'w') as f:
f.write(response.text)
# Parse the response even if it's not valid JSON
try:
resp_data = response.json()
print('Successfully parsed response as JSON')
# Save detailed diagnostics about the response structure
with open('$API_DIR/response_structure.txt', 'w') as f:
f.write(f'Response keys: {list(resp_data.keys())}\\n')
if 'document' in resp_data:
f.write(f'Document keys: {list(resp_data[\"document\"].keys() if resp_data[\"document\"] else [])}\\n')
# Check for specific content fields with null safety
doc = resp_data['document'] or {}
if 'html_content' in doc and doc['html_content']:
f.write(f'HTML content length: {len(doc[\"html_content\"])}\\n')
elif 'html_content' in doc:
f.write('HTML content is present but empty or null\\n')
if 'md_content' in doc and doc['md_content']:
f.write(f'Markdown content length: {len(doc[\"md_content\"])}\\n')
elif 'md_content' in doc:
f.write('Markdown content is present but empty or null\\n')
if 'text_content' in doc and doc['text_content']:
f.write(f'Text content length: {len(doc[\"text_content\"])}\\n')
elif 'text_content' in doc:
f.write('Text content is present but empty or null\\n')
if 'json_content' in doc and doc['json_content']:
f.write(f'JSON content length: {len(doc[\"json_content\"])}\\n')
elif 'json_content' in doc:
f.write('JSON content is present but empty or null\\n')
if 'outputs' in resp_data:
f.write(f'Outputs count: {len(resp_data[\"outputs\"])}\\n')
if resp_data['outputs']:
f.write(f'First output keys: {list(resp_data[\"outputs\"][0].keys())}\\n')
if 'files' in resp_data['outputs'][0]:
f.write(f'Files count: {len(resp_data[\"outputs\"][0][\"files\"])}\\n')
if resp_data['outputs'][0]['files']:
f.write(f'First file keys: {list(resp_data[\"outputs\"][0][\"files\"][0].keys())}\\n')
if 'content' in resp_data['outputs'][0]['files'][0]:
content_length = len(resp_data['outputs'][0]['files'][0]['content'])
f.write(f'Content length: {content_length}\\n')
# Process the response - check for outputs and files
if 'outputs' in resp_data and resp_data['outputs']:
output = resp_data['outputs'][0]
print(f'Found {len(resp_data[\"outputs\"])} outputs in response')
if 'files' in output and output['files']:
file_data = output['files'][0]
print(f'Found {len(output[\"files\"])} files in output')
if 'content' in file_data and file_data['content']:
print(f'Found content in file (length: {len(file_data[\"content\"])})')
with open('$API_DIR/output.$OUTPUT_FORMAT', 'w') as f:
f.write(file_data['content'])
print('CONVERSION SUCCESS')
sys.exit(0)
else:
if 'content' in file_data:
print('Content field exists but is empty')
else:
print('No content field in file data')
print(f'Available fields: {list(file_data.keys())}')
else:
print('No files found in output')
print(f'Available fields: {list(output.keys())}')
# Alternative response format check - document field
elif 'document' in resp_data and resp_data['status'] == 'success':
print('Found alternative response format with document field')
document = resp_data['document'] or {}
# Check format fields in document to see what's available
available_formats = []
if 'html_content' in document and document['html_content']:
available_formats.append(('html', document['html_content']))
if 'md_content' in document and document['md_content']:
available_formats.append(('md', document['md_content']))
if 'text_content' in document and document['text_content']:
available_formats.append(('text', document['text_content']))
if 'json_content' in document and document['json_content']:
available_formats.append(('json', document['json_content']))
if available_formats:
print(f'Found {len(available_formats)} available formats: {[f[0] for f in available_formats]}')
# First try to find the exact requested format
requested_format_match = next((f for f in available_formats if f[0] == '$OUTPUT_FORMAT'.lower()), None)
if requested_format_match:
format_type, content = requested_format_match
print(f'Found content in requested format {format_type} (length: {len(content)})')
else:
# If requested format not found, use the first available
format_type, content = available_formats[0]
print(f'Requested format not found, using alternative format {format_type} (length: {len(content)})')
# Save the content to the output file with appropriate extension
with open(f'$API_DIR/output.{format_type}', 'w') as f:
f.write(content)
# If we're using a different format than requested, also save with requested extension
if format_type != '$OUTPUT_FORMAT'.lower():
print(f'Saving content with requested extension {format_type} -> $OUTPUT_FORMAT')
with open('$API_DIR/output.$OUTPUT_FORMAT', 'w') as f:
f.write(content)
print('CONVERSION SUCCESS')
sys.exit(0)
else:
# No content fields found or all are empty
# Check if fields exist but are empty or null
empty_fields = []
if 'html_content' in document and not document['html_content']:
empty_fields.append('html_content')
if 'md_content' in document and not document['md_content']:
empty_fields.append('md_content')
if 'text_content' in document and not document['text_content']:
empty_fields.append('text_content')
if empty_fields:
print(f'Found content fields but they are empty or null: {empty_fields}')
else:
print('No content fields found in document')
print(f'Available fields in document: {list(document.keys() if document else [])}')
else:
print('No outputs found in response')
print(f'Available fields: {list(resp_data.keys())}')
# Try to extract any alternate formats or metadata
if 'metadata' in resp_data:
print('Metadata found in response, saving to file')
with open('$API_DIR/metadata.json', 'w') as f:
json.dump(resp_data['metadata'], f, indent=2)
print('CONVERSION PARTIAL - Some data available but not complete')
except Exception as json_error:
print(f'Failed to parse response as JSON: {json_error}')
traceback.print_exc()
# Save raw content as text if JSON parsing fails
with open('$API_DIR/output.txt', 'w') as f:
f.write(response.text)
print('Saved raw response as text file')
print('CONVERSION PARTIAL - Raw response saved')
else:
print(f'Error response: {response.text[:500]}')
print('CONVERSION FAILED')
except ImportError:
# Fall back to urllib
import urllib.request
import urllib.error
print('Using urllib library for API call')
headers = {'Content-Type': 'application/json'}
req_data = json.dumps(request_data).encode('utf-8')
req = urllib.request.Request(
'$DOCLING_API_ENDPOINT',
data=req_data,
headers=headers,
method='POST'
)
try:
start_time = time.time()
print(f'Starting conversion request at {time.strftime(\"%H:%M:%S\")}')
with urllib.request.urlopen(req, timeout=300) as response:
elapsed = time.time() - start_time
print(f'Conversion request completed in {elapsed:.2f} seconds')
print(f'Response status: {response.status}')
if response.status == 200:
response_text = response.read().decode('utf-8')
# Save full response for debugging
with open('$API_DIR/full_response.txt', 'w') as f:
f.write(f'Status: {response.status}\\n')
f.write(f'Headers: {response.headers}\\n\\n')
f.write(f'Content: {response_text[:10000]}...' if len(response_text) > 10000 else f'Content: {response_text}')
with open('$API_DIR/response.json', 'w') as f:
f.write(response_text)
try:
resp_data = json.loads(response_text)
print('Successfully parsed response as JSON')
# Save detailed diagnostics about the response structure
with open('$API_DIR/response_structure.txt', 'w') as f:
f.write(f'Response keys: {list(resp_data.keys())}\\n')
if 'document' in resp_data:
f.write(f'Document keys: {list(resp_data[\"document\"].keys() if resp_data[\"document\"] else [])}\\n')
# Check for specific content fields with null safety
doc = resp_data['document'] or {}
if 'html_content' in doc and doc['html_content']:
f.write(f'HTML content length: {len(doc[\"html_content\"])}\\n')
elif 'html_content' in doc:
f.write('HTML content is present but empty or null\\n')
if 'md_content' in doc and doc['md_content']:
f.write(f'Markdown content length: {len(doc[\"md_content\"])}\\n')
elif 'md_content' in doc:
f.write('Markdown content is present but empty or null\\n')
if 'text_content' in doc and doc['text_content']:
f.write(f'Text content length: {len(doc[\"text_content\"])}\\n')
elif 'text_content' in doc:
f.write('Text content is present but empty or null\\n')
if 'json_content' in doc and doc['json_content']:
f.write(f'JSON content length: {len(doc[\"json_content\"])}\\n')
elif 'json_content' in doc:
f.write('JSON content is present but empty or null\\n')
if 'outputs' in resp_data:
f.write(f'Outputs count: {len(resp_data[\"outputs\"])}\\n')
if resp_data['outputs']:
f.write(f'First output keys: {list(resp_data[\"outputs\"][0].keys())}\\n')
if 'files' in resp_data['outputs'][0]:
f.write(f'Files count: {len(resp_data[\"outputs\"][0][\"files\"])}\\n')
if resp_data['outputs'][0]['files']:
f.write(f'First file keys: {list(resp_data[\"outputs\"][0][\"files\"][0].keys())}\\n')
if 'content' in resp_data['outputs'][0]['files'][0]:
content_length = len(resp_data['outputs'][0]['files'][0]['content'])
f.write(f'Content length: {content_length}\\n')
if 'outputs' in resp_data and resp_data['outputs']:
output = resp_data['outputs'][0]
print(f'Found {len(resp_data[\"outputs\"])} outputs in response')
if 'files' in output and output['files']:
file_data = output['files'][0]
print(f'Found {len(output[\"files\"])} files in output')
if 'content' in file_data and file_data['content']:
print(f'Found content in file (length: {len(file_data[\"content\"])})')
with open('$API_DIR/output.$OUTPUT_FORMAT', 'w') as f:
f.write(file_data['content'])
print('CONVERSION SUCCESS')
sys.exit(0)
else:
if 'content' in file_data:
print('Content field exists but is empty')
else:
print('No content field in file data')
print(f'Available fields: {list(file_data.keys())}')
else:
print('No files found in output')
print(f'Available fields: {list(output.keys())}')
# Alternative response format check - document field
elif 'document' in resp_data and resp_data['status'] == 'success':
print('Found alternative response format with document field')
document = resp_data['document'] or {}
# Check format fields in document to see what's available
available_formats = []
if 'html_content' in document and document['html_content']:
available_formats.append(('html', document['html_content']))
if 'md_content' in document and document['md_content']:
available_formats.append(('md', document['md_content']))
if 'text_content' in document and document['text_content']:
available_formats.append(('text', document['text_content']))
if 'json_content' in document and document['json_content']:
available_formats.append(('json', document['json_content']))
if available_formats:
print(f'Found {len(available_formats)} available formats: {[f[0] for f in available_formats]}')
# First try to find the exact requested format
requested_format_match = next((f for f in available_formats if f[0] == '$OUTPUT_FORMAT'.lower()), None)
if requested_format_match:
format_type, content = requested_format_match
print(f'Found content in requested format {format_type} (length: {len(content)})')
else:
# If requested format not found, use the first available
format_type, content = available_formats[0]
print(f'Requested format not found, using alternative format {format_type} (length: {len(content)})')
# Save the content to the output file with appropriate extension
with open(f'$API_DIR/output.{format_type}', 'w') as f:
f.write(content)
# If we're using a different format than requested, also save with requested extension
if format_type != '$OUTPUT_FORMAT'.lower():
print(f'Saving content with requested extension {format_type} -> $OUTPUT_FORMAT')
with open('$API_DIR/output.$OUTPUT_FORMAT', 'w') as f:
f.write(content)
print('CONVERSION SUCCESS')
sys.exit(0)
else:
# No content fields found or all are empty
# Check if fields exist but are empty or null
empty_fields = []
if 'html_content' in document and not document['html_content']:
empty_fields.append('html_content')
if 'md_content' in document and not document['md_content']:
empty_fields.append('md_content')
if 'text_content' in document and not document['text_content']:
empty_fields.append('text_content')
if empty_fields:
print(f'Found content fields but they are empty or null: {empty_fields}')
else:
print('No content fields found in document')
print(f'Available fields in document: {list(document.keys() if document else [])}')
else:
print('No outputs found in response')
print(f'Available fields: {list(resp_data.keys())}')
print('CONVERSION PARTIAL - Some data available but not complete')
except Exception as json_error:
print(f'Failed to parse response as JSON: {json_error}')
traceback.print_exc()
# Save raw content as text if JSON parsing fails
with open('$API_DIR/output.txt', 'w') as f:
f.write(response_text)
print('Saved raw response as text file')
print('CONVERSION PARTIAL - Raw response saved')
else:
print(f'Error status: {response.status}')
print('CONVERSION FAILED')
except urllib.error.HTTPError as e:
print(f'HTTP Error: {e.code} - {e.reason}')
print(f'Response body: {e.read().decode(\"utf-8\")[:500]}')
print('CONVERSION FAILED')
except urllib.error.URLError as e:
print(f'URL Error: {e.reason}')
print('CONVERSION FAILED')
except Exception as e:
print(f'Unexpected error during urllib request: {e}')
traceback.print_exc()
print('CONVERSION FAILED')
except Exception as e:
print(f'Error during conversion: {e}')
traceback.print_exc()
print('CONVERSION FAILED')
" 2>&1
python "$TOOLS_DIR/docling_processor.py" \
--api-endpoint "$DOCLING_API_ENDPOINT" \
--request-json "$API_DIR/request.json" \
--output-dir "$API_DIR" \
--output-format "$OUTPUT_FORMAT"
PYTHON_EXIT_CODE=$?
# --- Check for various potential output files ---
@ -732,46 +425,8 @@ if [ -f "$API_DIR/output.$OUTPUT_FORMAT" ]; then
echo "Your document is available at: ${RESULT_URL}"
echo "=============================="
# Find the Apify CLI again (reusing the function's logic would be better, but for clarity we'll repeat)
APIFY_CMD=""
for cmd in "apify" "actor" "/usr/local/bin/apify" "/usr/bin/apify" "/opt/apify/cli/bin/apify"; do
if command -v "$cmd" &> /dev/null; then
APIFY_CMD="$cmd"
break
fi
done
if [ -n "$APIFY_CMD" ]; then
# Add record to dataset with enhanced version check prevention
echo "Adding record to dataset..."
# Create a temporary home directory with write permissions
export TMPDIR="/tmp/apify-home-${RANDOM}"
mkdir -p "$TMPDIR"
# Multiple strategies to disable version checking
export APIFY_DISABLE_VERSION_CHECK=1
export NODE_OPTIONS="--no-warnings"
export HOME="$TMPDIR" # Override home directory to writable location
# Use the --no-update-notifier flag if available
if $APIFY_CMD --help | grep -q "\--no-update-notifier"; then
if $APIFY_CMD --no-update-notifier actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"output_file\": \"${RESULT_URL}\", \"status\": \"success\"}"; then
echo "Successfully added record to dataset"
else
echo "Warning: Failed to add record to dataset"
fi
else
# Fall back to regular command
if $APIFY_CMD actor:push-data "{\"url\": \"${DOCUMENT_URL}\", \"output_file\": \"${RESULT_URL}\", \"status\": \"success\"}"; then
echo "Successfully added record to dataset"
else
echo "Warning: Failed to add record to dataset"
fi
fi
rm -rf "$TMPDIR" 2>/dev/null || true # Clean up temp dir
fi
# Push data to dataset
push_to_dataset "$DOCUMENT_URL" "$RESULT_URL"
fi
else
echo "ERROR: No converted output file found at $API_DIR/output.$OUTPUT_FORMAT"

376
.actor/docling_processor.py Executable file
View File

@ -0,0 +1,376 @@
#!/usr/bin/env python3
"""
Document Processing Script for Docling-Serve API
This script handles the communication with the docling-serve API,
processes the conversion request, and saves the output to the specified location.
"""
import argparse
import json
import os
import sys
import time
import traceback
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
# Global constants
DEFAULT_TIMEOUT = 300  # 5 minutes
OUTPUT_FORMATS = ["md", "html", "json", "text"]
def setup_arg_parser() -> argparse.ArgumentParser:
    """Build the command line argument parser for the processor."""
    parser = argparse.ArgumentParser(description="Process documents using docling-serve API")
    # All four core options are mandatory; declared in a table to avoid repetition.
    required_options = (
        ("--api-endpoint", "Docling API endpoint URL"),
        ("--request-json", "Path to JSON file with request data"),
        ("--output-dir", "Directory to save output files"),
        ("--output-format", "Desired output format"),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, help="Request timeout in seconds")
    return parser
def load_request_data(json_path: str) -> Dict:
    """Load the conversion request payload from a JSON file.

    Args:
        json_path: Path to the request JSON file.

    Returns:
        The parsed request dictionary.

    Exits the process with status 1 when the file is missing or contains
    invalid JSON.
    """
    try:
        with open(json_path, 'r') as f:
            return json.load(f)
    except (json.JSONDecodeError, FileNotFoundError) as e:
        # Diagnostics belong on stderr so stdout stays usable by callers.
        print(f"Error loading request data: {e}", file=sys.stderr)
        sys.exit(1)
def save_response_diagnostics(response_text: str, status_code: int, headers: Dict, output_dir: str) -> None:
    """Persist status code, headers, and (possibly truncated) body for debugging."""
    destination = os.path.join(output_dir, "full_response.txt")
    # Very long bodies are truncated to keep the diagnostics file small.
    if len(response_text) > 10000:
        body = f"Content: {response_text[:10000]}..."
    else:
        body = f"Content: {response_text}"
    with open(destination, 'w') as fh:
        fh.write(f"Status code: {status_code}\n")
        fh.write(f"Headers: {headers}\n\n")
        fh.write(body)
def save_response_json(response_text: str, output_dir: str) -> None:
    """Write the raw API response body to response.json inside output_dir."""
    target = os.path.join(output_dir, "response.json")
    with open(target, 'w') as fh:
        fh.write(response_text)
def save_structure_info(data: Dict, output_dir: str) -> None:
    """Dump a human-readable summary of the API response structure.

    Writes response_structure.txt describing the top-level keys, the
    per-format document content lengths, and the outputs/files/content
    hierarchy when present.
    """
    summary = [f'Response keys: {list(data.keys())}\n']

    # Document content details
    if 'document' in data:
        doc = data['document'] or {}
        summary.append(f'Document keys: {list(doc.keys() if doc else [])}\n')
        for content_type in ['html_content', 'md_content', 'text_content', 'json_content']:
            label = content_type.replace("_content", "").upper()
            if content_type in doc and doc[content_type]:
                summary.append(f'{label} content length: {len(doc[content_type])}\n')
            elif content_type in doc:
                summary.append(f'{label} content is present but empty or null\n')

    # Output structure details
    if 'outputs' in data:
        summary.append(f'Outputs count: {len(data["outputs"])}\n')
        if data['outputs']:
            first_output = data['outputs'][0]
            summary.append(f'First output keys: {list(first_output.keys())}\n')
            if 'files' in first_output:
                summary.append(f'Files count: {len(first_output["files"])}\n')
                if first_output['files']:
                    first_file = first_output['files'][0]
                    summary.append(f'First file keys: {list(first_file.keys())}\n')
                    if 'content' in first_file:
                        summary.append(f'Content length: {len(first_file["content"])}\n')

    with open(os.path.join(output_dir, "response_structure.txt"), 'w') as fh:
        fh.writelines(summary)
def extract_content_from_file_output(data: Dict, output_format: str, output_dir: str) -> bool:
    """Try the outputs[0].files[0].content response shape.

    Returns True and writes output.<output_format> when non-empty content is
    found; otherwise logs a diagnostic and returns False.
    """
    outputs = data.get('outputs')
    if not outputs:
        print('No outputs found in response')
        return False

    first_output = outputs[0]
    files = first_output.get('files')
    if not files:
        print('No files found in output')
        print(f'Available fields: {list(first_output.keys())}')
        return False

    file_entry = files[0]
    content = file_entry.get('content')
    if not content:
        if 'content' in file_entry:
            print('Content field exists but is empty')
        else:
            print('No content field in file data')
        print(f'Available fields: {list(file_entry.keys())}')
        return False

    # Content found, save it
    print(f'Found content in file (length: {len(content)})')
    with open(os.path.join(output_dir, f"output.{output_format}"), 'w') as fh:
        fh.write(content)
    print('CONVERSION SUCCESS')
    return True
def extract_content_from_document(data: Dict, output_format: str, output_dir: str) -> bool:
    """Try the alternative {'document': {...}, 'status': 'success'} shape.

    Prefers the requested format when present, otherwise falls back to the
    first available one; writes output.<ext> and, when a substitute format
    was used, mirrors it under output.<output_format>.  Returns True on
    success, False otherwise.
    """
    if 'document' not in data or data.get('status') != 'success':
        print('No document field or success status found in response')
        return False

    document = data['document'] or {}
    format_order = ['html', 'md', 'text', 'json']

    # Collect (format, content) pairs for every non-empty *_content field.
    available_formats = [
        (fmt, document[f'{fmt}_content'])
        for fmt in format_order
        if document.get(f'{fmt}_content')
    ]

    if not available_formats:
        # Distinguish "fields exist but empty" from "no fields at all".
        empty_fields = [
            f'{fmt}_content'
            for fmt in format_order
            if f'{fmt}_content' in document and not document[f'{fmt}_content']
        ]
        if empty_fields:
            print(f'Found content fields but they are empty or null: {empty_fields}')
        else:
            print('No content fields found in document')
        print(f'Available fields in document: {list(document.keys() if document else [])}')
        return False

    print(f'Found {len(available_formats)} available formats: {[f[0] for f in available_formats]}')

    wanted = output_format.lower()
    match = next((pair for pair in available_formats if pair[0] == wanted), None)
    if match is not None:
        format_type, content = match
        print(f'Found content in requested format {format_type} (length: {len(content)})')
    else:
        # If requested format not found, use the first available
        format_type, content = available_formats[0]
        print(f'Requested format not found, using alternative format {format_type} (length: {len(content)})')

    # Save with the matched format's extension
    with open(os.path.join(output_dir, f"output.{format_type}"), 'w') as fh:
        fh.write(content)

    # If we're using a different format than requested, also save with requested extension
    if format_type != wanted:
        print(f'Saving content with requested extension {format_type} -> {output_format}')
        with open(os.path.join(output_dir, f"output.{output_format}"), 'w') as fh:
            fh.write(content)

    print('CONVERSION SUCCESS')
    return True
def process_success_response(response_text: str, output_format: str, output_dir: str) -> bool:
    """Process a successful (HTTP 200) response and extract document content.

    Saves the raw response and a structure summary for diagnostics, then
    tries the two known docling-serve response layouts in turn.

    Args:
        response_text: Raw response body returned by the API.
        output_format: Desired output format extension (e.g. 'md').
        output_dir: Directory where output artifacts are written.

    Returns:
        True when document content was extracted; False on partial results
        or when the response could not be processed.
    """
    try:
        # Always keep the raw response on disk for post-mortem inspection.
        save_response_json(response_text, output_dir)
        # Parse JSON separately so parse failures are reported accurately
        # instead of being conflated with extraction errors.
        try:
            data = json.loads(response_text)
        except json.JSONDecodeError as json_error:
            print(f'Failed to parse response as JSON: {json_error}')
            # Save raw content as text if JSON parsing fails.
            with open(os.path.join(output_dir, "output.txt"), 'w', encoding='utf-8') as f:
                f.write(response_text)
            print('Saved raw response as text file')
            print('CONVERSION PARTIAL - Raw response saved')
            return False
        print('Successfully parsed response as JSON')
        # Save detailed structure info
        save_structure_info(data, output_dir)
        # Try both response formats
        if extract_content_from_file_output(data, output_format, output_dir):
            return True
        if extract_content_from_document(data, output_format, output_dir):
            return True
        # No content found; at least preserve any metadata the API returned.
        if 'metadata' in data:
            print('Metadata found in response, saving to file')
            with open(os.path.join(output_dir, "metadata.json"), 'w', encoding='utf-8') as f:
                json.dump(data['metadata'], f, indent=2)
        print('CONVERSION PARTIAL - Some data available but not complete')
        return False
    except Exception as error:
        # Best-effort: an unexpected failure still leaves the raw response
        # behind and reports a partial conversion rather than crashing.
        print(f'Failed to process response: {error}')
        traceback.print_exc()
        with open(os.path.join(output_dir, "output.txt"), 'w', encoding='utf-8') as f:
            f.write(response_text)
        print('Saved raw response as text file')
        print('CONVERSION PARTIAL - Raw response saved')
        return False
def process_requests_api(api_endpoint: str, request_data: Dict, output_format: str, output_dir: str, timeout: int) -> bool:
    """Process using the requests library.

    POSTs request_data as JSON to api_endpoint and, on HTTP 200, delegates
    content extraction to process_success_response().

    Args:
        api_endpoint: Full URL of the docling-serve conversion endpoint.
        request_data: JSON-serializable request payload.
        output_format: Desired output format extension (e.g. 'md').
        output_dir: Directory where response artifacts are written.
        timeout: Request timeout in seconds.

    Returns:
        True when the response was processed successfully, False otherwise.

    Raises:
        ImportError: When the 'requests' package is not installed, so the
            caller (process_document) can fall back to the urllib path.
    """
    # Import OUTSIDE the try/except: the previous placement inside the broad
    # `except Exception` swallowed ImportError and returned False, which
    # prevented process_document() from ever reaching its urllib fallback.
    import requests
    try:
        print('Using requests library for API call')
        # Record start time for timing
        start_time = time.time()
        print(f'Starting conversion request at {time.strftime("%H:%M:%S")}')
        response = requests.post(
            api_endpoint,
            json=request_data,
            timeout=timeout
        )
        elapsed = time.time() - start_time
        print(f'Conversion request completed in {elapsed:.2f} seconds')
        print(f'Response status code: {response.status_code}')
        # Save response diagnostics
        save_response_diagnostics(response.text, response.status_code, dict(response.headers), output_dir)
        if response.status_code == 200:
            return process_success_response(response.text, output_format, output_dir)
        print(f'Error response: {response.text[:500]}')
        print('CONVERSION FAILED')
        return False
    except Exception as e:
        print(f'Error during requests API call: {e}')
        traceback.print_exc()
        print('CONVERSION FAILED')
        return False
def process_urllib_api(api_endpoint: str, request_data: Dict, output_format: str, output_dir: str, timeout: int) -> bool:
    """Process using urllib as fallback.

    Stdlib fallback used when the 'requests' package is unavailable. POSTs
    request_data as JSON to api_endpoint and, on HTTP 200, delegates content
    extraction to process_success_response().

    Args:
        api_endpoint: Full URL of the docling-serve conversion endpoint.
        request_data: JSON-serializable request payload.
        output_format: Desired output format extension (e.g. 'md').
        output_dir: Directory where response artifacts are written.
        timeout: Socket timeout for the request, in seconds.

    Returns:
        True when the response was processed successfully, False otherwise.
    """
    try:
        import urllib.request
        import urllib.error
        print('Using urllib library for API call')
        headers = {'Content-Type': 'application/json'}
        req_data = json.dumps(request_data).encode('utf-8')
        req = urllib.request.Request(
            api_endpoint,
            data=req_data,
            headers=headers,
            method='POST'
        )
        try:
            # Time the call so slow conversions are visible in the logs.
            start_time = time.time()
            print(f'Starting conversion request at {time.strftime("%H:%M:%S")}')
            with urllib.request.urlopen(req, timeout=timeout) as response:
                elapsed = time.time() - start_time
                print(f'Conversion request completed in {elapsed:.2f} seconds')
                print(f'Response status: {response.status}')
                response_text = response.read().decode('utf-8')
                save_response_diagnostics(response_text, response.status, dict(response.headers), output_dir)
                if response.status == 200:
                    return process_success_response(response_text, output_format, output_dir)
                else:
                    # NOTE(review): urlopen raises HTTPError for most non-2xx
                    # codes, so this branch mainly covers unusual statuses.
                    print(f'Error status: {response.status}')
                    print('CONVERSION FAILED')
                    return False
        except urllib.error.HTTPError as e:
            # Server responded with an HTTP error code (4xx/5xx).
            print(f'HTTP Error: {e.code} - {e.reason}')
            print(f'Response body: {e.read().decode("utf-8")[:500]}')
            print('CONVERSION FAILED')
            return False
        except urllib.error.URLError as e:
            # Network-level failure (DNS, connection refused, timeout, ...).
            print(f'URL Error: {e.reason}')
            print('CONVERSION FAILED')
            return False
        except Exception as e:
            print(f'Unexpected error during urllib request: {e}')
            traceback.print_exc()
            print('CONVERSION FAILED')
            return False
    except Exception as e:
        # Failure before the request was even issued (e.g. unserializable payload).
        print(f'Error setting up urllib: {e}')
        traceback.print_exc()
        print('CONVERSION FAILED')
        return False
def process_document(api_endpoint: str, request_json_path: str, output_format: str,
                     output_dir: str, timeout: int) -> bool:
    """Main function to process a document through the docling-serve API.

    Loads the request payload from disk, logs a short summary of it, then
    dispatches the HTTP call — preferring the requests library and falling
    back to urllib when requests is not installed.
    """
    try:
        # Make sure there is somewhere to put the results.
        os.makedirs(output_dir, exist_ok=True)
        payload = load_request_data(request_json_path)
        # Summarize the request in the logs before issuing it.
        if payload.get('http_sources'):
            first_source = payload['http_sources'][0]
            print(f'Request to convert URL: {first_source["url"]}')
        if 'options' in payload:
            conversion_options = payload['options']
            if conversion_options.get('to_formats'):
                print(f'Output format: {conversion_options["to_formats"][0]}')
            if 'ocr' in conversion_options:
                print(f'OCR enabled: {conversion_options["ocr"]}')
        # Prefer requests; fall back to the stdlib client if it is missing.
        try:
            return process_requests_api(api_endpoint, payload, output_format,
                                        output_dir, timeout)
        except ImportError:
            return process_urllib_api(api_endpoint, payload, output_format,
                                      output_dir, timeout)
    except Exception as e:
        print(f'Error during conversion: {e}')
        traceback.print_exc()
        print('CONVERSION FAILED')
        return False
def main():
    """Command-line entry point: parse arguments, run the conversion, exit."""
    args = setup_arg_parser().parse_args()
    ok = process_document(
        api_endpoint=args.api_endpoint,
        request_json_path=args.request_json,
        output_format=args.output_format,
        output_dir=args.output_dir,
        timeout=args.timeout,
    )
    # A non-zero exit status signals failure to the calling shell script.
    sys.exit(0 if ok else 1)


if __name__ == "__main__":
    main()