Merge from upstream

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-08 16:40:55 +02:00
commit 080042d06d
3 changed files with 21 additions and 12 deletions

View File

@ -1,5 +1,5 @@
import logging import logging
from io import BytesIO from io import BytesIO, TextIOWrapper
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
@ -21,6 +21,7 @@ _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend): class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
print("About to init HTML backend...")
super().__init__(path_or_stream, document_hash) super().__init__(path_or_stream, document_hash)
self.soup = None self.soup = None
# HTML file: # HTML file:
@ -33,6 +34,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[i] = None self.parents[i] = None
self.labels = {} self.labels = {}
try:
if isinstance(self.path_or_stream, BytesIO):
text_stream = byte_stream.getvalue().decode("utf-8")
print(text_stream)
self.soup = BeautifulSoup(text_stream, "html.parser")
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
html_content = f.read()
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
_log.error("could not parse html: {}".format(e))
return doc
def is_valid(self) -> bool: def is_valid(self) -> bool:
return True return True
@ -52,15 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff # access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy") doc = DoclingDocument(description=DescriptionItem(), name="dummy")
print("Trying to convert HTML...")
try:
with open(self.path_or_stream, "r", encoding="utf-8") as f:
html_content = f.read()
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
_log.error("could not parse html: {}".format(e))
return doc
# Replace <br> tags with newline characters # Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"): for br in self.soup.body.find_all("br"):
br.replace_with("\n") br.replace_with("\n")

View File

@ -74,8 +74,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
# Parses the PPTX into a structured document model. # Parses the PPTX into a structured document model.
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash) # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin( origin = DocumentOrigin(
filename=self.path_or_stream.name, filename=fname,
mimetype="application/vnd.ms-powerpoint", mimetype="application/vnd.ms-powerpoint",
binary_hash=self.document_hash, binary_hash=self.document_hash,
) )

View File

@ -18,7 +18,6 @@ input_paths = [
Path("tests/data/word_sample.docx"), Path("tests/data/word_sample.docx"),
Path("tests/data/lorem_ipsum.docx"), Path("tests/data/lorem_ipsum.docx"),
Path("tests/data/powerpoint_sample.pptx"), Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2206.01062.pdf"), Path("tests/data/2206.01062.pdf"),
] ]
input = DocumentConversionInput.from_paths(input_paths) input = DocumentConversionInput.from_paths(input_paths)