mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 22:14:37 +00:00
Actor: Improve shell script robustness and error handling
The shell script has been enhanced with better error handling, input validation, and cleanup procedures. Key improvements include:

- Added proper quoting around variables to prevent word splitting.
- Improved error messages and logging functionality.
- Implemented a cleanup trap to ensure temporary files are removed.
- Enhanced validation of input parameters and output formats.
- Added better handling of the log file and its storage.
- Improved command execution with proper evaluation.
- Added comments for better code readability and maintenance.
- Fixed potential security issues with proper variable expansion.

Signed-off-by: Václav Vančura <commit@vancura.dev>
This commit is contained in:
parent
dde401d134
commit
ff7d64b421
@ -8,7 +8,8 @@ set -e
|
||||
|
||||
# --- Validate Docling installation ---
|
||||
|
||||
if ! command -v docling &> /dev/null; then
|
||||
# Check if Docling CLI is installed and in PATH.
|
||||
if ! command -v docling &>/dev/null; then
|
||||
echo "Error: Docling CLI is not installed or not in PATH"
|
||||
exit 1
|
||||
fi
|
||||
@ -16,11 +17,15 @@ fi
|
||||
# --- Input parsing ---
|
||||
|
||||
echo "Parsing actor input..."
|
||||
INPUT=$(apify actor:get-input || { echo "Failed to get input"; exit 1; })
|
||||
|
||||
DOCUMENT_URL=$(echo "$INPUT" | jq -r '.documentUrl')
|
||||
OUTPUT_FORMAT=$(echo "$INPUT" | jq -r '.outputFormat')
|
||||
OUTPUT_NAME="output_file.$OUTPUT_FORMAT"
|
||||
INPUT="$(apify actor:get-input || {
|
||||
echo "Failed to get input"
|
||||
exit 1
|
||||
})"
|
||||
|
||||
DOCUMENT_URL="$(echo "${INPUT}" | jq -r '.documentUrl')"
|
||||
OUTPUT_FORMAT="$(echo "${INPUT}" | jq -r '.outputFormat')"
|
||||
OUTPUT_NAME="output_file.${OUTPUT_FORMAT}"
|
||||
|
||||
# If no document URL is provided, exit with an error.
|
||||
if [ -z "$DOCUMENT_URL" ]; then
|
||||
@ -34,21 +39,21 @@ if [ -z "$OUTPUT_FORMAT" ]; then
|
||||
echo "No output format specified. Defaulting to 'md'"
|
||||
fi
|
||||
|
||||
case "$OUTPUT_FORMAT" in
|
||||
md|json|html|text|doctags)
|
||||
;;
|
||||
*)
|
||||
echo "Error: Invalid output format '$OUTPUT_FORMAT'. Supported formats are 'md', 'json', 'html', 'text', and 'doctags'"
|
||||
exit 1
|
||||
;;
|
||||
# Validate the output format.
|
||||
case "$OUTPUT_FORMAT" in md | json | html | text | doctags) ;;
|
||||
*)
|
||||
echo "Error: Invalid output format '$OUTPUT_FORMAT'. Supported formats are 'md', 'json', 'html', 'text', and 'doctags'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
# --- Build Docling command ---
|
||||
|
||||
DOC_CONVERT_CMD="docling --verbose $DOCUMENT_URL --to $OUTPUT_FORMAT"
|
||||
DOC_CONVERT_CMD="docling --verbose \"${DOCUMENT_URL}\" --to \"${OUTPUT_FORMAT}\""
|
||||
|
||||
if [ "$(echo "$INPUT" | jq -r '.ocr')" = "true" ]; then
|
||||
DOC_CONVERT_CMD="$DOC_CONVERT_CMD --ocr"
|
||||
# If OCR is enabled, add the OCR flag to the command.
|
||||
if [ "$(echo "${INPUT}" | jq -r '.ocr')" = "true" ]; then
|
||||
DOC_CONVERT_CMD="${DOC_CONVERT_CMD} --ocr"
|
||||
fi
|
||||
|
||||
# --- Process document with Docling ---
|
||||
@ -57,22 +62,27 @@ echo "Processing document with Docling CLI..."
|
||||
echo "Running: $DOC_CONVERT_CMD"
|
||||
|
||||
# Create a timestamp file to ensure the document is processed only once.
|
||||
touch docling.timestamp
|
||||
|
||||
$DOC_CONVERT_CMD > docling.log 2>&1 || {
|
||||
echo "Error: Docling CLI failed. Check 'docling.log' for details";
|
||||
cat docling.log;
|
||||
exit 1;
|
||||
touch docling.timestamp || {
|
||||
echo "Error: Failed to create timestamp file"
|
||||
exit 1
|
||||
}
|
||||
|
||||
GENERATED_FILE=$(find . -type f -name "*.$OUTPUT_FORMAT" -newer docling.timestamp)
|
||||
# Execute the command and capture both stdout and stderr.
|
||||
eval "$DOC_CONVERT_CMD" >docling.log 2>&1 || {
|
||||
cat docling.log
|
||||
echo "Error: Docling command failed"
|
||||
exit 1
|
||||
}
|
||||
|
||||
GENERATED_FILE="$(find . -type f -name "*.${OUTPUT_FORMAT}" -newer docling.timestamp)"
|
||||
|
||||
# If no generated file is found, exit with an error.
|
||||
if [ -z "$GENERATED_FILE" ]; then
|
||||
echo "Error: Could not find generated output file with extension .$OUTPUT_FORMAT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mv "$GENERATED_FILE" "$OUTPUT_NAME"
|
||||
mv "${GENERATED_FILE}" "${OUTPUT_NAME}"
|
||||
|
||||
# --- Validate output ---
|
||||
|
||||
@ -93,13 +103,32 @@ echo "Document successfully processed and exported as '$OUTPUT_FORMAT' to file:
|
||||
# --- Store output and log in Key-Value Store ---
|
||||
|
||||
echo "Pushing processed document to Key-Value Store (record key: OUTPUT_RESULT)..."
|
||||
apify actor:set-value "OUTPUT_RESULT" --contentType "application/$OUTPUT_FORMAT" < "$OUTPUT_NAME" || {
|
||||
apify actor:set-value "OUTPUT_RESULT" --contentType "application/$OUTPUT_FORMAT" <"$OUTPUT_NAME" || {
|
||||
echo "Error: Failed to push the output document to the Key-Value Store"
|
||||
exit 1
|
||||
}
|
||||
|
||||
if [ -f "docling.log" ]; then
|
||||
if [ -s "docling.log" ]; then
|
||||
echo "Log file is not empty, pushing to Key-Value Store (record key: DOCLING_LOG)..."
|
||||
apify actor:set-value "DOCLING_LOG" --contentType "text/plain" <"docling.log" || {
|
||||
echo "Warning: Failed to push the log file to the Key-Value Store"
|
||||
}
|
||||
else
|
||||
echo "Warning: docling.log file exists but is empty"
|
||||
fi
|
||||
else
|
||||
echo "Warning: No docling.log file found"
|
||||
fi
|
||||
|
||||
# --- Cleanup temporary files ---
|
||||
|
||||
rm -f docling.timestamp docling.log || true
|
||||
cleanup() {
|
||||
local exit_code=$?
|
||||
rm -f docling.timestamp docling.log || true
|
||||
exit $exit_code
|
||||
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
echo "Done!"
|
||||
|
Loading…
Reference in New Issue
Block a user