mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Backend error handling fixes
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
304d16029a
commit
025983f07b
@ -203,7 +203,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
|
||||
|
||||
if not success:
|
||||
raise RuntimeError(
|
||||
f"docling-parse could not load document {document_hash}."
|
||||
f"docling-parse could not load document with hash {document_hash}."
|
||||
)
|
||||
|
||||
def page_count(self) -> int:
|
||||
|
@ -21,7 +21,7 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
print("About to init HTML backend...")
|
||||
_log.debug("About to init HTML backend...")
|
||||
super().__init__(path_or_stream, document_hash)
|
||||
self.soup = None
|
||||
# HTML file:
|
||||
@ -36,16 +36,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
text_stream = byte_stream.getvalue().decode("utf-8")
|
||||
print(text_stream)
|
||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||
self.soup = BeautifulSoup(text_stream, "html.parser")
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||
html_content = f.read()
|
||||
self.soup = BeautifulSoup(html_content, "html.parser")
|
||||
except Exception as e:
|
||||
_log.error("could not parse html: {}".format(e))
|
||||
return doc
|
||||
raise RuntimeError(
|
||||
f"Could not initialize HTML backend for file with hash {document_hash}."
|
||||
) from e
|
||||
|
||||
def is_valid(self) -> bool:
    """Report whether the HTML input was parsed successfully.

    ``__init__`` sets ``self.soup`` to ``None`` and only replaces it once
    BeautifulSoup has parsed the input. A remaining ``None`` therefore
    means nothing was loaded (for example ``path_or_stream`` was neither a
    ``BytesIO`` nor a ``Path``), and ``convert`` would fail when it reads
    ``self.soup.body``.

    Returns:
        True when a parsed document is available, False otherwise.
    """
    return self.soup is not None
|
||||
@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def convert(self) -> DoclingDocument:
|
||||
# access self.path_or_stream to load stuff
|
||||
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
||||
print("Trying to convert HTML...")
|
||||
_log.debug("Trying to convert HTML...")
|
||||
# Replace <br> tags with newline characters
|
||||
for br in self.soup.body.find_all("br"):
|
||||
br.replace_with("\n")
|
||||
@ -93,7 +93,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def analyse_element(self, element, idx, doc):
|
||||
"""
|
||||
if element.name!=None:
|
||||
print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
|
||||
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
|
||||
"""
|
||||
|
||||
if element.name in self.labels:
|
||||
@ -323,7 +323,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
doc.add_table(data=data, parent=self.parents[self.level])
|
||||
|
||||
def get_list_text(list_element, level=0):
|
||||
def get_list_text(self, list_element, level=0):
|
||||
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
||||
result = []
|
||||
bullet_char = "*" # Default bullet character for unordered lists
|
||||
@ -335,7 +335,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Handle nested lists
|
||||
nested_list = li.find(["ul", "ol"])
|
||||
if nested_list:
|
||||
result.extend(get_list_text(nested_list, level + 1))
|
||||
result.extend(self.get_list_text(nested_list, level + 1))
|
||||
elif list_element.name == "ul": # For unordered lists, use bullet points
|
||||
for li in list_element.find_all("li", recursive=False):
|
||||
# Add bullet points for unordered lists
|
||||
@ -345,7 +345,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Handle nested lists
|
||||
nested_list = li.find(["ul", "ol"])
|
||||
if nested_list:
|
||||
result.extend(get_list_text(nested_list, level + 1))
|
||||
result.extend(self.get_list_text(nested_list, level + 1))
|
||||
|
||||
return result
|
||||
|
||||
|
@ -39,12 +39,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
self.path_or_stream = path_or_stream
|
||||
|
||||
self.pptx_obj = None
|
||||
self.valid = True
|
||||
self.valid = False
|
||||
try:
|
||||
self.pptx_obj = Presentation(self.path_or_stream)
|
||||
self.valid = True
|
||||
except Exception:
|
||||
_log.error("could not parse pptx")
|
||||
self.valid = False
|
||||
raise RuntimeError(
|
||||
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
|
||||
) from e
|
||||
|
||||
return
|
||||
|
||||
|
@ -34,6 +34,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
# self.initialise(path_or_stream)
|
||||
# Word file:
|
||||
self.path_or_stream = path_or_stream
|
||||
self.valid = False
|
||||
# Initialise the parents for the hierarchy
|
||||
self.max_levels = 10
|
||||
self.level_at_new_list = None
|
||||
@ -50,6 +51,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
"indents": [None],
|
||||
}
|
||||
|
||||
self.docx_obj = None
|
||||
try:
|
||||
self.docx_obj = docx.Document(self.path_or_stream)
|
||||
self.valid = True
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
|
||||
) from e
|
||||
|
||||
def is_valid(self) -> bool:
    """Report whether the DOCX input was loaded successfully.

    ``__init__`` sets ``self.valid`` to False up front and flips it to True
    only after ``docx.Document`` has opened the input, so the flag (rather
    than a hard-coded True) is returned.

    Returns:
        The current value of ``self.valid``.
    """
    return self.valid
|
||||
|
||||
@ -69,15 +79,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
def convert(self) -> DoclingDocument:
|
||||
# Parses the DOCX into a structured document model.
|
||||
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
||||
docx_obj = None
|
||||
try:
|
||||
docx_obj = docx.Document(self.path_or_stream)
|
||||
except Exception:
|
||||
_log.error("could not parse docx")
|
||||
return doc
|
||||
|
||||
# self.initialise()
|
||||
doc = self.walk_linear(docx_obj.element.body, docx_obj, doc)
|
||||
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||
return doc
|
||||
|
||||
def update_history(self, name, level, numid, ilevel):
|
||||
|
@ -238,7 +238,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
||||
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
||||
except PdfiumError as e:
|
||||
raise RuntimeError(
|
||||
f"pypdfium could not load document {document_hash}"
|
||||
f"pypdfium could not load document with hash {document_hash}"
|
||||
) from e
|
||||
|
||||
def page_count(self) -> int:
|
||||
|
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import re
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
@ -497,19 +498,40 @@ class DocumentConversionInput(BaseModel):
|
||||
)
|
||||
|
||||
def _guess_format(self, obj):
|
||||
content = None
|
||||
if isinstance(obj, Path):
|
||||
mime = filetype.guess_mime(str(obj))
|
||||
elif isinstance(obj, DocumentStream):
|
||||
mime = filetype.guess_mime(obj.stream.read(8192))
|
||||
if mime is None:
|
||||
# TODO improve this.
|
||||
if mime is None:
|
||||
with obj.open("rb") as f:
|
||||
content = f.read(1024) # Read first 1KB
|
||||
|
||||
if obj.suffix == ".html":
|
||||
mime = "text/html"
|
||||
elif isinstance(obj, DocumentStream):
|
||||
obj.stream.seek(0)
|
||||
content = obj.stream.read(8192)
|
||||
obj.stream.seek(0)
|
||||
mime = filetype.guess_mime(content)
|
||||
|
||||
if mime is None:
|
||||
mime = self._detect_html_xhtml(content)
|
||||
|
||||
format = MimeTypeToFormat.get(mime)
|
||||
return format
|
||||
|
||||
def _detect_html_xhtml(self, content):
|
||||
content_str = content.decode("ascii", errors="ignore").lower()
|
||||
# Remove XML comments
|
||||
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
|
||||
content_str = content_str.lstrip()
|
||||
|
||||
if re.match(r"<\?xml", content_str):
|
||||
if "xhtml" in content_str[:1000]:
|
||||
return "application/xhtml+xml"
|
||||
|
||||
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
||||
return "text/html"
|
||||
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
|
||||
paths = [Path(p) for p in paths]
|
||||
|
@ -120,7 +120,7 @@ def main():
|
||||
]
|
||||
|
||||
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
|
||||
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
||||
# docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
|
||||
# input = DocumentConversionInput.from_streams(docs)
|
||||
|
||||
doc_converter = DocumentConverter()
|
||||
|
@ -38,6 +38,7 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
||||
InputFormat.PDF,
|
||||
# InputFormat.IMAGE,
|
||||
InputFormat.DOCX,
|
||||
InputFormat.HTML,
|
||||
], # whitelist formats, other files are ignored.
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
|
Loading…
Reference in New Issue
Block a user