diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 36e02dfe..dc873971 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,5 +1,5 @@ import logging -from io import BytesIO +from io import BytesIO, TextIOWrapper from pathlib import Path from typing import Set, Union @@ -21,6 +21,7 @@ _log = logging.getLogger(__name__) class HTMLDocumentBackend(DeclarativeDocumentBackend): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + print("About to init HTML backend...") super().__init__(path_or_stream, document_hash) self.soup = None # HTML file: @@ -33,6 +34,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.parents[i] = None self.labels = {} + try: + if isinstance(self.path_or_stream, BytesIO): + text_stream = byte_stream.getvalue().decode("utf-8") + print(text_stream) + self.soup = BeautifulSoup(text_stream, "html.parser") + if isinstance(self.path_or_stream, Path): + with open(self.path_or_stream, "r", encoding="utf-8") as f: + html_content = f.read() + self.soup = BeautifulSoup(html_content, "html.parser") + except Exception as e: + _log.error("could not parse html: {}".format(e)) + return doc + def is_valid(self) -> bool: return True @@ -52,15 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def convert(self) -> DoclingDocument: # access self.path_or_stream to load stuff doc = DoclingDocument(description=DescriptionItem(), name="dummy") - - try: - with open(self.path_or_stream, "r", encoding="utf-8") as f: - html_content = f.read() - self.soup = BeautifulSoup(html_content, "html.parser") - except Exception as e: - _log.error("could not parse html: {}".format(e)) - return doc - + print("Trying to convert HTML...") # Replace <br> tags with newline characters for br in self.soup.body.find_all("br"): br.replace_with("\n") diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 91be4fd8..22eae9d4 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -74,8 +74,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB # Parses the PPTX into a structured document model. # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash) + fname = "" + if isinstance(self.path_or_stream, Path): + fname = self.path_or_stream.name + origin = DocumentOrigin( - filename=self.path_or_stream.name, + filename=fname, mimetype="application/vnd.ms-powerpoint", binary_hash=self.document_hash, ) diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py index cdf1c670..aeb254a9 100644 --- a/examples/run_with_formats.py +++ b/examples/run_with_formats.py @@ -18,7 +18,6 @@ input_paths = [ Path("tests/data/word_sample.docx"), Path("tests/data/lorem_ipsum.docx"), Path("tests/data/powerpoint_sample.pptx"), - Path("tests/data/powerpoint_sample.pptx"), Path("tests/data/2206.01062.pdf"), ] input = DocumentConversionInput.from_paths(input_paths)