From 67e112936508b01fe43f1f31208a41a379decc87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Van=C4=8Dura?= Date: Wed, 22 Jan 2025 11:56:55 +0100 Subject: [PATCH] Actor: Documentation update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Václav Vančura Signed-off-by: Adam Kliment --- .actor/actor.sh | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/.actor/actor.sh b/.actor/actor.sh index cec63373..95ee9ab2 100755 --- a/.actor/actor.sh +++ b/.actor/actor.sh @@ -2,17 +2,18 @@ # --- Setup Error Handling --- +# Exit the script if any command fails. trap 'echo "Error on line $LINENO"; exit 1' ERR set -e -# --- Validate Docling Installation --- +# --- Validate Docling installation --- if ! command -v docling &> /dev/null; then echo "Error: Docling CLI is not installed or not in PATH" exit 1 fi -# --- Input Parsing --- +# --- Input parsing --- echo "Parsing actor input..." INPUT=$(apify actor:get-input || { echo "Failed to get input"; exit 1; }) @@ -21,11 +22,13 @@ DOCUMENT_URL=$(echo "$INPUT" | jq -r '.documentUrl') OUTPUT_FORMAT=$(echo "$INPUT" | jq -r '.outputFormat') OUTPUT_NAME="output_file.$OUTPUT_FORMAT" +# If no document URL is provided, exit with an error. if [ -z "$DOCUMENT_URL" ]; then echo "Error: Missing document URL. Please provide 'documentUrl' in the input" exit 1 fi +# If no output format is specified, default to 'md'. if [ -z "$OUTPUT_FORMAT" ]; then OUTPUT_FORMAT="md" echo "No output format specified. Defaulting to 'md'" @@ -40,7 +43,7 @@ case "$OUTPUT_FORMAT" in ;; esac -# --- Build Docling Command --- +# --- Build Docling command --- DOC_CONVERT_CMD="docling --verbose $DOCUMENT_URL --to $OUTPUT_FORMAT" @@ -48,11 +51,12 @@ if [ "$(echo "$INPUT" | jq -r '.ocr')" = "true" ]; then DOC_CONVERT_CMD="$DOC_CONVERT_CMD --ocr" fi -# --- Process Document with Docling --- +# --- Process document with Docling --- echo "Processing document with Docling CLI..." 
echo "Running: $DOC_CONVERT_CMD" +# Create a timestamp marker file before running Docling (presumably used to identify files generated by this run; it is removed during cleanup). touch docling.timestamp $DOC_CONVERT_CMD > docling.log 2>&1 || { @@ -70,13 +74,15 @@ fi mv "$GENERATED_FILE" "$OUTPUT_NAME" -# --- Validate Output --- +# --- Validate output --- +# If the output file is not found, exit with an error. if [ ! -f "$OUTPUT_NAME" ]; then echo "Error: Expected output file '$OUTPUT_NAME' was not generated" exit 1 fi +# If the output file is empty, exit with an error. if [ ! -s "$OUTPUT_NAME" ]; then echo "Error: Generated output file '$OUTPUT_NAME' is empty" exit 1 @@ -84,7 +90,7 @@ fi echo "Document successfully processed and exported as '$OUTPUT_FORMAT' to file: $OUTPUT_NAME" -# --- Store Output in Key-Value Store --- +# --- Store output and log in Key-Value Store --- echo "Pushing processed document to Key-Value Store (record key: OUTPUT_RESULT)..." apify actor:set-value "OUTPUT_RESULT" --contentType "application/$OUTPUT_FORMAT" < "$OUTPUT_NAME" || { @@ -92,7 +98,7 @@ apify actor:set-value "OUTPUT_RESULT" --contentType "application/$OUTPUT_FORMAT" exit 1 } -# --- Cleanup Temporary Files --- +# --- Cleanup temporary files --- rm -f docling.timestamp docling.log || true