From 07d952acf97ecab216fc397d3a810f641c8a07f2 Mon Sep 17 00:00:00 2001 From: Maxim Lysak Date: Tue, 8 Oct 2024 16:37:47 +0200 Subject: [PATCH] Improved backends Signed-off-by: Maxim Lysak --- docling/backend/html_backend.py | 26 +++++++++++++++---------- docling/backend/mspowerpoint_backend.py | 6 +++++- examples/run_with_formats.py | 12 ++++++++++++ 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index c0315aaf..bd6aff56 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,5 +1,5 @@ import logging -from io import BytesIO +from io import BytesIO, TextIOWrapper from pathlib import Path from typing import Set, Union @@ -27,6 +27,7 @@ _log = logging.getLogger(__name__) class HTMLDocumentBackend(DeclarativeDocumentBackend): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + print("About to init HTML backend...") super().__init__(path_or_stream, document_hash) self.soup = None # HTML file: @@ -39,6 +40,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.parents[i] = None self.labels = {} + try: + if isinstance(self.path_or_stream, BytesIO): + text_stream = byte_stream.getvalue().decode("utf-8") + print(text_stream) + self.soup = BeautifulSoup(text_stream, "html.parser") + if isinstance(self.path_or_stream, Path): + with open(self.path_or_stream, "r", encoding="utf-8") as f: + html_content = f.read() + self.soup = BeautifulSoup(html_content, "html.parser") + except Exception as e: + _log.error("could not parse html: {}".format(e)) + return doc + def is_valid(self) -> bool: return True @@ -58,15 +72,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def convert(self) -> DoclingDocument: # access self.path_or_stream to load stuff doc = DoclingDocument(description=DescriptionItem(), name="dummy") - - try: - with open(self.path_or_stream, "r", encoding="utf-8") as f: - html_content = f.read() - self.soup = 
BeautifulSoup(html_content, "html.parser") - except Exception as e: - _log.error("could not parse html: {}".format(e)) - return doc - + print("Trying to convert HTML...") # Replace <br>
tags with newline characters for br in self.soup.body.find_all("br"): br.replace_with("\n") diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 49027356..16e0de69 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -78,8 +78,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB # Parses the PPTX into a structured document model. # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash) + fname = "" + if isinstance(self.path_or_stream, Path): + fname = self.path_or_stream.name + origin = DocumentOrigin( - filename=self.path_or_stream.name, + filename=fname, mimetype="application/vnd.ms-powerpoint", binary_hash=self.document_hash, ) diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py index f143c9f8..573c58b1 100644 --- a/examples/run_with_formats.py +++ b/examples/run_with_formats.py @@ -1,5 +1,6 @@ import json import logging +from io import BytesIO, TextIOWrapper from pathlib import Path from typing import Iterable @@ -9,6 +10,7 @@ from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ( ConversionStatus, + DocumentStream, InputFormat, PdfPipelineOptions, PipelineOptions, @@ -29,6 +31,16 @@ input_paths = [ Path("tests/data/powerpoint_sample.pptx"), Path("tests/data/2206.01062.pdf"), ] + +input_bytes = [] +for p in input_paths: + buf = BytesIO(p.open("rb").read()) + # tstream = TextIOWrapper(buf, encoding='utf-8') + # input_bytes.append(tstream) + bstream = DocumentStream(filename=p.name, stream=buf) + input_bytes.append(bstream) + +# input = DocumentConversionInput.from_streams(input_bytes) input = DocumentConversionInput.from_paths(input_paths) # for defaults use: