From 025983f07bce1d3a6399ce8026653c8ce523b836 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Fri, 11 Oct 2024 11:18:47 +0200 Subject: [PATCH] Backend error handling fixes Signed-off-by: Christoph Auer --- docling/backend/docling_parse_backend.py | 2 +- docling/backend/html_backend.py | 20 +++++++------- docling/backend/mspowerpoint_backend.py | 8 +++--- docling/backend/msword_backend.py | 18 ++++++++----- docling/backend/pypdfium2_backend.py | 2 +- docling/datamodel/document.py | 34 +++++++++++++++++++----- examples/batch_convert.py | 2 +- examples/run_with_formats.py | 1 + 8 files changed, 58 insertions(+), 29 deletions(-) diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index 7d5c3113..e3e2293e 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -203,7 +203,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend): if not success: raise RuntimeError( - f"docling-parse could not load document {document_hash}." + f"docling-parse could not load document with hash {document_hash}." 
) def page_count(self) -> int: diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 2ab27095..cae81085 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -21,7 +21,7 @@ _log = logging.getLogger(__name__) class HTMLDocumentBackend(DeclarativeDocumentBackend): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): - print("About to init HTML backend...") + _log.debug("About to init HTML backend...") super().__init__(path_or_stream, document_hash) self.soup = None # HTML file: @@ -36,16 +36,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): try: if isinstance(self.path_or_stream, BytesIO): - text_stream = byte_stream.getvalue().decode("utf-8") - print(text_stream) + text_stream = self.path_or_stream.getvalue().decode("utf-8") self.soup = BeautifulSoup(text_stream, "html.parser") if isinstance(self.path_or_stream, Path): with open(self.path_or_stream, "r", encoding="utf-8") as f: html_content = f.read() self.soup = BeautifulSoup(html_content, "html.parser") except Exception as e: - _log.error("could not parse html: {}".format(e)) - return doc + raise RuntimeError( + f"Could not initialize HTML backend for file with hash {document_hash}." + ) from e def is_valid(self) -> bool: return True @@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def convert(self) -> DoclingDocument: # access self.path_or_stream to load stuff doc = DoclingDocument(description=DescriptionItem(), name="dummy") - print("Trying to convert HTML...") + _log.debug("Trying to convert HTML...") # Replace
<br> tags with newline characters for br in self.soup.body.find_all("br"): br.replace_with("\n") @@ -93,7 +93,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def analyse_element(self, element, idx, doc): """ if element.name!=None: - print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") + _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") """ if element.name in self.labels: @@ -323,7 +323,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): doc.add_table(data=data, parent=self.parents[self.level]) - def get_list_text(list_element, level=0): + def get_list_text(self, list_element, level=0): """Recursively extract text from