Improved backends

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak 2024-10-08 16:37:47 +02:00
parent 1d55cbdca9
commit 07d952acf9
3 changed files with 33 additions and 11 deletions

View File

@ -1,5 +1,5 @@
import logging
from io import BytesIO
from io import BytesIO, TextIOWrapper
from pathlib import Path
from typing import Set, Union
@ -27,6 +27,7 @@ _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
print("About to init HTML backend...")
super().__init__(path_or_stream, document_hash)
self.soup = None
# HTML file:
@ -39,6 +40,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[i] = None
self.labels = {}
# Parse the HTML once, at construction time, from whichever input kind was
# supplied. If anything goes wrong the error is logged and self.soup is left
# at the value it had before this statement.
try:
    if isinstance(self.path_or_stream, BytesIO):
        # In-memory input: decode the buffered bytes as UTF-8 text.
        # (Previously read from an undefined name, `byte_stream`.)
        text_stream = self.path_or_stream.getvalue().decode("utf-8")
        self.soup = BeautifulSoup(text_stream, "html.parser")
    elif isinstance(self.path_or_stream, Path):
        # File input: read the whole file as UTF-8 text, then parse it.
        with open(self.path_or_stream, "r", encoding="utf-8") as f:
            html_content = f.read()
        self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
    # Nothing is returned here: a constructor must return None, and no
    # `doc` object exists in this scope.
    _log.error("could not parse html: {}".format(e))
# Validity check for this backend: unconditionally returns True. No attribute
# of the instance (for example self.soup) is inspected.
def is_valid(self) -> bool:
return True
@ -58,15 +72,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
try:
with open(self.path_or_stream, "r", encoding="utf-8") as f:
html_content = f.read()
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
_log.error("could not parse html: {}".format(e))
return doc
print("Trying to convert HTML...")
# Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"):
br.replace_with("\n")

View File

@ -78,8 +78,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
# Parses the PPTX into a structured document model.
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
# Only a Path input is asked for its `.name`; any other input kind is
# recorded with an empty file name.
fname = ""
if isinstance(self.path_or_stream, Path):
    fname = self.path_or_stream.name

# `filename` is passed exactly once (a repeated keyword argument is a
# syntax error), using the type-checked name computed above.
origin = DocumentOrigin(
    filename=fname,
    mimetype="application/vnd.ms-powerpoint",
    binary_hash=self.document_hash,
)

View File

@ -1,5 +1,6 @@
import json
import logging
from io import BytesIO, TextIOWrapper
from pathlib import Path
from typing import Iterable
@ -9,6 +10,7 @@ from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
InputFormat,
PdfPipelineOptions,
PipelineOptions,
@ -29,6 +31,16 @@ input_paths = [
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2206.01062.pdf"),
]
# Build an in-memory DocumentStream for every path in input_paths, so the
# same inputs can be fed to the converter either as paths or as streams.
input_bytes = []
for p in input_paths:
    # read_bytes() opens, reads and closes the file in one call, so no file
    # handle is left open (the earlier p.open("rb").read() never closed it).
    buf = BytesIO(p.read_bytes())
    bstream = DocumentStream(filename=p.name, stream=buf)
    input_bytes.append(bstream)

# To exercise stream inputs instead of paths, swap in:
# input = DocumentConversionInput.from_streams(input_bytes)
# NOTE(review): `input` shadows the builtin; the name is kept because code
# outside this view may reference it.
input = DocumentConversionInput.from_paths(input_paths)
# for defaults use: