diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
index 7d5c3113..e3e2293e 100644
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -203,7 +203,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
if not success:
raise RuntimeError(
- f"docling-parse could not load document {document_hash}."
+ f"docling-parse could not load document with hash {document_hash}."
)
def page_count(self) -> int:
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 2ab27095..cae81085 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -21,7 +21,7 @@ _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
- print("About to init HTML backend...")
+ _log.debug("About to init HTML backend...")
super().__init__(path_or_stream, document_hash)
self.soup = None
# HTML file:
@@ -36,16 +36,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
try:
if isinstance(self.path_or_stream, BytesIO):
- text_stream = byte_stream.getvalue().decode("utf-8")
- print(text_stream)
+ text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.soup = BeautifulSoup(text_stream, "html.parser")
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
html_content = f.read()
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
- _log.error("could not parse html: {}".format(e))
- return doc
+ raise RuntimeError(
+ f"Could not initialize HTML backend for file with hash {document_hash}."
+ ) from e
def is_valid(self) -> bool:
return True
@@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
- print("Trying to convert HTML...")
+ _log.debug("Trying to convert HTML...")
# Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"):
br.replace_with("\n")
@@ -93,7 +93,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def analyse_element(self, element, idx, doc):
"""
if element.name!=None:
- print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
+ _log.debug("%s %s \t %s (%s)", "\t" * self.level, idx, element.name, self.level)
"""
if element.name in self.labels:
@@ -323,7 +323,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=data, parent=self.parents[self.level])
- def get_list_text(list_element, level=0):
+ def get_list_text(self, list_element, level=0):
"""Recursively extract text from <ul> or <ol> with proper indentation."""
result = []
bullet_char = "*" # Default bullet character for unordered lists
@@ -335,7 +335,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Handle nested lists
nested_list = li.find(["ul", "ol"])
if nested_list:
- result.extend(get_list_text(nested_list, level + 1))
+ result.extend(self.get_list_text(nested_list, level + 1))
elif list_element.name == "ul": # For unordered lists, use bullet points
for li in list_element.find_all("li", recursive=False):
# Add bullet points for unordered lists
@@ -345,7 +345,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Handle nested lists
nested_list = li.find(["ul", "ol"])
if nested_list:
- result.extend(get_list_text(nested_list, level + 1))
+ result.extend(self.get_list_text(nested_list, level + 1))
return result
diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index f5a635ee..7703e3b1 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -39,12 +39,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
self.path_or_stream = path_or_stream
self.pptx_obj = None
- self.valid = True
+ self.valid = False
try:
self.pptx_obj = Presentation(self.path_or_stream)
+ self.valid = True
- except Exception:
- _log.error("could not parse pptx")
- self.valid = False
+ except Exception as e:
+ raise RuntimeError(
+ f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
+ ) from e
return
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index fbde6869..49911009 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -34,6 +34,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# self.initialise(path_or_stream)
# Word file:
self.path_or_stream = path_or_stream
+ self.valid = False
# Initialise the parents for the hierarchy
self.max_levels = 10
self.level_at_new_list = None
@@ -50,6 +51,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
"indents": [None],
}
+ self.docx_obj = None
+ try:
+ self.docx_obj = docx.Document(self.path_or_stream)
+ self.valid = True
+ except Exception as e:
+ raise RuntimeError(
+ f"MsWordDocumentBackend could not load document with hash {document_hash}"
+ ) from e
+
def is_valid(self) -> bool:
return True
@@ -69,15 +79,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
# Parses the DOCX into a structured document model.
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
- docx_obj = None
- try:
- docx_obj = docx.Document(self.path_or_stream)
- except Exception:
- _log.error("could not parse docx")
- return doc
# self.initialise()
- doc = self.walk_linear(docx_obj.element.body, docx_obj, doc)
+ doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
return doc
def update_history(self, name, level, numid, ilevel):
diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py
index e7d7ae84..e4c6e423 100644
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@@ -238,7 +238,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
self._pdoc = pdfium.PdfDocument(path_or_stream)
except PdfiumError as e:
raise RuntimeError(
- f"pypdfium could not load document {document_hash}"
+ f"pypdfium could not load document with hash {document_hash}"
) from e
def page_count(self) -> int:
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index b7c020f2..721ccfcd 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -1,4 +1,5 @@
import logging
+import re
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
@@ -497,19 +498,40 @@ class DocumentConversionInput(BaseModel):
)
def _guess_format(self, obj):
+ content = None
if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj))
- elif isinstance(obj, DocumentStream):
- mime = filetype.guess_mime(obj.stream.read(8192))
- if mime is None:
- # TODO improve this.
+ if mime is None:
+ with obj.open("rb") as f:
+ content = f.read(1024) # Read first 1KB
- if obj.suffix == ".html":
- mime = "text/html"
+ elif isinstance(obj, DocumentStream):
+ obj.stream.seek(0)
+ content = obj.stream.read(8192)
+ obj.stream.seek(0)
+ mime = filetype.guess_mime(content)
+
+ if mime is None:
+ mime = self._detect_html_xhtml(content)
format = MimeTypeToFormat.get(mime)
return format
+ def _detect_html_xhtml(self, content):
+ content_str = content.decode("ascii", errors="ignore").lower()
+ # Remove XML comments
+ content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
+ content_str = content_str.lstrip()
+
+ if re.match(r"<\?xml", content_str):
+ if "xhtml" in content_str[:1000]:
+ return "application/xhtml+xml"
+
+ if re.match(r"