Backend error handling fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-11 11:18:47 +02:00
parent 304d16029a
commit 025983f07b
8 changed files with 58 additions and 29 deletions

View File

@ -203,7 +203,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
if not success: if not success:
raise RuntimeError( raise RuntimeError(
f"docling-parse could not load document {document_hash}." f"docling-parse could not load document with hash {document_hash}."
) )
def page_count(self) -> int: def page_count(self) -> int:

View File

@ -21,7 +21,7 @@ _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend): class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
print("About to init HTML backend...") _log.debug("About to init HTML backend...")
super().__init__(path_or_stream, document_hash) super().__init__(path_or_stream, document_hash)
self.soup = None self.soup = None
# HTML file: # HTML file:
@ -36,16 +36,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
try: try:
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
text_stream = byte_stream.getvalue().decode("utf-8") text_stream = self.path_or_stream.getvalue().decode("utf-8")
print(text_stream)
self.soup = BeautifulSoup(text_stream, "html.parser") self.soup = BeautifulSoup(text_stream, "html.parser")
if isinstance(self.path_or_stream, Path): if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f: with open(self.path_or_stream, "r", encoding="utf-8") as f:
html_content = f.read() html_content = f.read()
self.soup = BeautifulSoup(html_content, "html.parser") self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e: except Exception as e:
_log.error("could not parse html: {}".format(e)) raise RuntimeError(
return doc f"Could not initialize HTML backend for file with hash {document_hash}."
) from e
def is_valid(self) -> bool: def is_valid(self) -> bool:
return True return True
@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff # access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy") doc = DoclingDocument(description=DescriptionItem(), name="dummy")
print("Trying to convert HTML...") _log.debug("Trying to convert HTML...")
# Replace <br> tags with newline characters # Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"): for br in self.soup.body.find_all("br"):
br.replace_with("\n") br.replace_with("\n")
@ -93,7 +93,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def analyse_element(self, element, idx, doc): def analyse_element(self, element, idx, doc):
""" """
if element.name!=None: if element.name!=None:
print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
""" """
if element.name in self.labels: if element.name in self.labels:
@ -323,7 +323,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=data, parent=self.parents[self.level]) doc.add_table(data=data, parent=self.parents[self.level])
def get_list_text(list_element, level=0): def get_list_text(self, list_element, level=0):
"""Recursively extract text from <ul> or <ol> with proper indentation.""" """Recursively extract text from <ul> or <ol> with proper indentation."""
result = [] result = []
bullet_char = "*" # Default bullet character for unordered lists bullet_char = "*" # Default bullet character for unordered lists
@ -335,7 +335,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Handle nested lists # Handle nested lists
nested_list = li.find(["ul", "ol"]) nested_list = li.find(["ul", "ol"])
if nested_list: if nested_list:
result.extend(get_list_text(nested_list, level + 1)) result.extend(self.get_list_text(nested_list, level + 1))
elif list_element.name == "ul": # For unordered lists, use bullet points elif list_element.name == "ul": # For unordered lists, use bullet points
for li in list_element.find_all("li", recursive=False): for li in list_element.find_all("li", recursive=False):
# Add bullet points for unordered lists # Add bullet points for unordered lists
@ -345,7 +345,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Handle nested lists # Handle nested lists
nested_list = li.find(["ul", "ol"]) nested_list = li.find(["ul", "ol"])
if nested_list: if nested_list:
result.extend(get_list_text(nested_list, level + 1)) result.extend(self.get_list_text(nested_list, level + 1))
return result return result

View File

@ -39,12 +39,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
self.path_or_stream = path_or_stream self.path_or_stream = path_or_stream
self.pptx_obj = None self.pptx_obj = None
self.valid = True self.valid = False
try: try:
self.pptx_obj = Presentation(self.path_or_stream) self.pptx_obj = Presentation(self.path_or_stream)
self.valid = True
except Exception: except Exception:
_log.error("could not parse pptx") raise RuntimeError(
self.valid = False f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
) from e
return return

View File

@ -34,6 +34,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# self.initialise(path_or_stream) # self.initialise(path_or_stream)
# Word file: # Word file:
self.path_or_stream = path_or_stream self.path_or_stream = path_or_stream
self.valid = False
# Initialise the parents for the hierarchy # Initialise the parents for the hierarchy
self.max_levels = 10 self.max_levels = 10
self.level_at_new_list = None self.level_at_new_list = None
@ -50,6 +51,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
"indents": [None], "indents": [None],
} }
self.docx_obj = None
try:
self.docx_obj = docx.Document(self.path_or_stream)
self.valid = True
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
) from e
def is_valid(self) -> bool: def is_valid(self) -> bool:
return True return True
@ -69,15 +79,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# Parses the DOCX into a structured document model. # Parses the DOCX into a structured document model.
doc = DoclingDocument(description=DescriptionItem(), name="dummy") doc = DoclingDocument(description=DescriptionItem(), name="dummy")
docx_obj = None
try:
docx_obj = docx.Document(self.path_or_stream)
except Exception:
_log.error("could not parse docx")
return doc
# self.initialise() # self.initialise()
doc = self.walk_linear(docx_obj.element.body, docx_obj, doc) doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
return doc return doc
def update_history(self, name, level, numid, ilevel): def update_history(self, name, level, numid, ilevel):

View File

@ -238,7 +238,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
self._pdoc = pdfium.PdfDocument(path_or_stream) self._pdoc = pdfium.PdfDocument(path_or_stream)
except PdfiumError as e: except PdfiumError as e:
raise RuntimeError( raise RuntimeError(
f"pypdfium could not load document {document_hash}" f"pypdfium could not load document with hash {document_hash}"
) from e ) from e
def page_count(self) -> int: def page_count(self) -> int:

View File

@ -1,4 +1,5 @@
import logging import logging
import re
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path, PurePath from pathlib import Path, PurePath
@ -497,19 +498,40 @@ class DocumentConversionInput(BaseModel):
) )
def _guess_format(self, obj): def _guess_format(self, obj):
content = None
if isinstance(obj, Path): if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj)) mime = filetype.guess_mime(str(obj))
elif isinstance(obj, DocumentStream): if mime is None:
mime = filetype.guess_mime(obj.stream.read(8192)) with obj.open("rb") as f:
if mime is None: content = f.read(1024) # Read first 1KB
# TODO improve this.
if obj.suffix == ".html": elif isinstance(obj, DocumentStream):
mime = "text/html" obj.stream.seek(0)
content = obj.stream.read(8192)
obj.stream.seek(0)
mime = filetype.guess_mime(content)
if mime is None:
mime = self._detect_html_xhtml(content)
format = MimeTypeToFormat.get(mime) format = MimeTypeToFormat.get(mime)
return format return format
def _detect_html_xhtml(self, content):
content_str = content.decode("ascii", errors="ignore").lower()
# Remove XML comments
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
content_str = content_str.lstrip()
if re.match(r"<\?xml", content_str):
if "xhtml" in content_str[:1000]:
return "application/xhtml+xml"
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
return "text/html"
return None
@classmethod @classmethod
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None): def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
paths = [Path(p) for p in paths] paths = [Path(p) for p in paths]

View File

@ -120,7 +120,7 @@ def main():
] ]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read()) # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)] # docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs) # input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter() doc_converter = DocumentConverter()

View File

@ -38,6 +38,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
InputFormat.PDF, InputFormat.PDF,
# InputFormat.IMAGE, # InputFormat.IMAGE,
InputFormat.DOCX, InputFormat.DOCX,
InputFormat.HTML,
], # whitelist formats, other files are ignored. ], # whitelist formats, other files are ignored.
format_options={ format_options={
InputFormat.PDF: PdfFormatOption( InputFormat.PDF: PdfFormatOption(