mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
Actor: Documentation update
Signed-off-by: Václav Vančura <commit@vancura.dev> Signed-off-by: Adam Kliment <adam@netmilk.net>
This commit is contained in:
parent
66287e45a5
commit
67e1129365
@ -2,17 +2,18 @@
|
||||
|
||||
# --- Setup Error Handling ---
|
||||
|
||||
# Exit the script if any command fails.
|
||||
trap 'echo "Error on line $LINENO"; exit 1' ERR
|
||||
set -e
|
||||
|
||||
# --- Validate Docling Installation ---
|
||||
# --- Validate Docling installation ---
|
||||
|
||||
if ! command -v docling &> /dev/null; then
|
||||
echo "Error: Docling CLI is not installed or not in PATH"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# --- Input Parsing ---
|
||||
# --- Input parsing ---
|
||||
|
||||
echo "Parsing actor input..."
|
||||
INPUT=$(apify actor:get-input || { echo "Failed to get input"; exit 1; })
|
||||
@ -21,11 +22,13 @@ DOCUMENT_URL=$(echo "$INPUT" | jq -r '.documentUrl')
|
||||
OUTPUT_FORMAT=$(echo "$INPUT" | jq -r '.outputFormat')
|
||||
OUTPUT_NAME="output_file.$OUTPUT_FORMAT"
|
||||
|
||||
# If no document URL is provided, exit with an error.
|
||||
if [ -z "$DOCUMENT_URL" ]; then
|
||||
echo "Error: Missing document URL. Please provide 'documentUrl' in the input"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# If no output format is specified, default to 'md'.
|
||||
if [ -z "$OUTPUT_FORMAT" ]; then
|
||||
OUTPUT_FORMAT="md"
|
||||
echo "No output format specified. Defaulting to 'md'"
|
||||
@ -40,7 +43,7 @@ case "$OUTPUT_FORMAT" in
|
||||
;;
|
||||
esac
|
||||
|
||||
# --- Build Docling Command ---
|
||||
# --- Build Docling command ---
|
||||
|
||||
DOC_CONVERT_CMD="docling --verbose $DOCUMENT_URL --to $OUTPUT_FORMAT"
|
||||
|
||||
@ -48,11 +51,12 @@ if [ "$(echo "$INPUT" | jq -r '.ocr')" = "true" ]; then
|
||||
DOC_CONVERT_CMD="$DOC_CONVERT_CMD --ocr"
|
||||
fi
|
||||
|
||||
# --- Process Document with Docling ---
|
||||
# --- Process document with Docling ---
|
||||
|
||||
echo "Processing document with Docling CLI..."
|
||||
echo "Running: $DOC_CONVERT_CMD"
|
||||
|
||||
# Create a timestamp file to ensure the document is processed only once.
|
||||
touch docling.timestamp
|
||||
|
||||
$DOC_CONVERT_CMD > docling.log 2>&1 || {
|
||||
@ -70,13 +74,15 @@ fi
|
||||
|
||||
mv "$GENERATED_FILE" "$OUTPUT_NAME"
|
||||
|
||||
# --- Validate Output ---
|
||||
# --- Validate output ---
|
||||
|
||||
# If the output file is not found, exit with an error.
|
||||
if [ ! -f "$OUTPUT_NAME" ]; then
|
||||
echo "Error: Expected output file '$OUTPUT_NAME' was not generated"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# If the output file is empty, exit with an error.
|
||||
if [ ! -s "$OUTPUT_NAME" ]; then
|
||||
echo "Error: Generated output file '$OUTPUT_NAME' is empty"
|
||||
exit 1
|
||||
@ -84,7 +90,7 @@ fi
|
||||
|
||||
echo "Document successfully processed and exported as '$OUTPUT_FORMAT' to file: $OUTPUT_NAME"
|
||||
|
||||
# --- Store Output in Key-Value Store ---
|
||||
# --- Store output and log in Key-Value Store ---
|
||||
|
||||
echo "Pushing processed document to Key-Value Store (record key: OUTPUT_RESULT)..."
|
||||
apify actor:set-value "OUTPUT_RESULT" --contentType "application/$OUTPUT_FORMAT" < "$OUTPUT_NAME" || {
|
||||
@ -92,7 +98,7 @@ apify actor:set-value "OUTPUT_RESULT" --contentType "application/$OUTPUT_FORMAT"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# --- Cleanup Temporary Files ---
|
||||
# --- Cleanup temporary files ---
|
||||
|
||||
rm -f docling.timestamp docling.log || true
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user