Improved backends

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak 2024-10-08 16:37:47 +02:00
parent 1d55cbdca9
commit 07d952acf9
3 changed files with 33 additions and 11 deletions

View File

@ -1,5 +1,5 @@
import logging import logging
from io import BytesIO from io import BytesIO, TextIOWrapper
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
@ -27,6 +27,7 @@ _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend): class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
print("About to init HTML backend...")
super().__init__(path_or_stream, document_hash) super().__init__(path_or_stream, document_hash)
self.soup = None self.soup = None
# HTML file: # HTML file:
@ -39,6 +40,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.parents[i] = None self.parents[i] = None
self.labels = {} self.labels = {}
try:
if isinstance(self.path_or_stream, BytesIO):
text_stream = byte_stream.getvalue().decode("utf-8")
print(text_stream)
self.soup = BeautifulSoup(text_stream, "html.parser")
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
html_content = f.read()
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
_log.error("could not parse html: {}".format(e))
return doc
def is_valid(self) -> bool: def is_valid(self) -> bool:
return True return True
@ -58,15 +72,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff # access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy") doc = DoclingDocument(description=DescriptionItem(), name="dummy")
print("Trying to convert HTML...")
try:
with open(self.path_or_stream, "r", encoding="utf-8") as f:
html_content = f.read()
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
_log.error("could not parse html: {}".format(e))
return doc
# Replace <br> tags with newline characters # Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"): for br in self.soup.body.find_all("br"):
br.replace_with("\n") br.replace_with("\n")

View File

@ -78,8 +78,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
# Parses the PPTX into a structured document model. # Parses the PPTX into a structured document model.
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash) # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin( origin = DocumentOrigin(
filename=self.path_or_stream.name, filename=fname,
mimetype="application/vnd.ms-powerpoint", mimetype="application/vnd.ms-powerpoint",
binary_hash=self.document_hash, binary_hash=self.document_hash,
) )

View File

@ -1,5 +1,6 @@
import json import json
import logging import logging
from io import BytesIO, TextIOWrapper
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable
@ -9,6 +10,7 @@ from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
ConversionStatus, ConversionStatus,
DocumentStream,
InputFormat, InputFormat,
PdfPipelineOptions, PdfPipelineOptions,
PipelineOptions, PipelineOptions,
@ -29,6 +31,16 @@ input_paths = [
Path("tests/data/powerpoint_sample.pptx"), Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2206.01062.pdf"), Path("tests/data/2206.01062.pdf"),
] ]
input_bytes = []
for p in input_paths:
buf = BytesIO(p.open("rb").read())
# tstream = TextIOWrapper(buf, encoding='utf-8')
# input_bytes.append(tstream)
bstream = DocumentStream(filename=p.name, stream=buf)
input_bytes.append(bstream)
# input = DocumentConversionInput.from_streams(input_bytes)
input = DocumentConversionInput.from_paths(input_paths) input = DocumentConversionInput.from_paths(input_paths)
# for defaults use: # for defaults use: