diff --git a/.actor/actor.sh b/.actor/actor.sh index 95ee9ab2..b825b766 100755 --- a/.actor/actor.sh +++ b/.actor/actor.sh @@ -8,7 +8,8 @@ set -e # --- Validate Docling installation --- -if ! command -v docling &> /dev/null; then +# Check if Docling CLI is installed and in PATH. +if ! command -v docling &>/dev/null; then echo "Error: Docling CLI is not installed or not in PATH" exit 1 fi @@ -16,11 +17,15 @@ fi # --- Input parsing --- echo "Parsing actor input..." -INPUT=$(apify actor:get-input || { echo "Failed to get input"; exit 1; }) -DOCUMENT_URL=$(echo "$INPUT" | jq -r '.documentUrl') -OUTPUT_FORMAT=$(echo "$INPUT" | jq -r '.outputFormat') -OUTPUT_NAME="output_file.$OUTPUT_FORMAT" +INPUT="$(apify actor:get-input || { + echo "Failed to get input" + exit 1 +})" + +DOCUMENT_URL="$(echo "${INPUT}" | jq -r '.documentUrl')" +OUTPUT_FORMAT="$(echo "${INPUT}" | jq -r '.outputFormat')" +OUTPUT_NAME="output_file.${OUTPUT_FORMAT}" # If no document URL is provided, exit with an error. if [ -z "$DOCUMENT_URL" ]; then @@ -34,21 +39,21 @@ if [ -z "$OUTPUT_FORMAT" ]; then echo "No output format specified. Defaulting to 'md'" fi -case "$OUTPUT_FORMAT" in - md|json|html|text|doctags) - ;; - *) - echo "Error: Invalid output format '$OUTPUT_FORMAT'. Supported formats are 'md', 'json', 'html', 'text', and 'doctags'" - exit 1 - ;; +# Validate the output format. +case "$OUTPUT_FORMAT" in md | json | html | text | doctags) ;; +*) + echo "Error: Invalid output format '$OUTPUT_FORMAT'. Supported formats are 'md', 'json', 'html', 'text', and 'doctags'" + exit 1 + ;; esac # --- Build Docling command --- -DOC_CONVERT_CMD="docling --verbose $DOCUMENT_URL --to $OUTPUT_FORMAT" +DOC_CONVERT_CMD="docling --verbose \"${DOCUMENT_URL}\" --to \"${OUTPUT_FORMAT}\"" -if [ "$(echo "$INPUT" | jq -r '.ocr')" = "true" ]; then - DOC_CONVERT_CMD="$DOC_CONVERT_CMD --ocr" +# If OCR is enabled, add the OCR flag to the command. +if [ "$(echo "${INPUT}" | jq -r '.ocr')" = "true" ]; then + DOC_CONVERT_CMD="${DOC_CONVERT_CMD} --ocr" fi # --- Process document with Docling --- @@ -57,22 +62,27 @@ echo "Processing document with Docling CLI..." echo "Running: $DOC_CONVERT_CMD" # Create a timestamp file to ensure the document is processed only once. -touch docling.timestamp - -$DOC_CONVERT_CMD > docling.log 2>&1 || { - echo "Error: Docling CLI failed. Check 'docling.log' for details"; - cat docling.log; - exit 1; +touch docling.timestamp || { + echo "Error: Failed to create timestamp file" + exit 1 } -GENERATED_FILE=$(find . -type f -name "*.$OUTPUT_FORMAT" -newer docling.timestamp) +# Execute the command and capture both stdout and stderr. +eval "$DOC_CONVERT_CMD" >docling.log 2>&1 || { + cat docling.log + echo "Error: Docling command failed" + exit 1 +} +GENERATED_FILE="$(find . -type f -name "*.${OUTPUT_FORMAT}" -newer docling.timestamp)" + +# If no generated file is found, exit with an error. if [ -z "$GENERATED_FILE" ]; then echo "Error: Could not find generated output file with extension .$OUTPUT_FORMAT" exit 1 fi -mv "$GENERATED_FILE" "$OUTPUT_NAME" +mv "${GENERATED_FILE}" "${OUTPUT_NAME}" # --- Validate output --- @@ -93,13 +103,32 @@ echo "Document successfully processed and exported as '$OUTPUT_FORMAT' to file: # --- Store output and log in Key-Value Store --- echo "Pushing processed document to Key-Value Store (record key: OUTPUT_RESULT)..." -apify actor:set-value "OUTPUT_RESULT" --contentType "application/$OUTPUT_FORMAT" < "$OUTPUT_NAME" || { +apify actor:set-value "OUTPUT_RESULT" --contentType "application/$OUTPUT_FORMAT" <"$OUTPUT_NAME" || { echo "Error: Failed to push the output document to the Key-Value Store" exit 1 } +if [ -f "docling.log" ]; then + if [ -s "docling.log" ]; then + echo "Log file is not empty, pushing to Key-Value Store (record key: DOCLING_LOG)..." + apify actor:set-value "DOCLING_LOG" --contentType "text/plain" <"docling.log" || { + echo "Warning: Failed to push the log file to the Key-Value Store" + } + else + echo "Warning: docling.log file exists but is empty" + fi +else + echo "Warning: No docling.log file found" +fi + # --- Cleanup temporary files --- -rm -f docling.timestamp docling.log || true +cleanup() { + local exit_code=$? + rm -f docling.timestamp docling.log || true + exit $exit_code +} + +trap cleanup EXIT echo "Done!"