mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Improved backends
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
1d55cbdca9
commit
07d952acf9
@ -1,5 +1,5 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from io import BytesIO, TextIOWrapper
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
@ -27,6 +27,7 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
print("About to init HTML backend...")
|
||||
super().__init__(path_or_stream, document_hash)
|
||||
self.soup = None
|
||||
# HTML file:
|
||||
@ -39,6 +40,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.parents[i] = None
|
||||
self.labels = {}
|
||||
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
text_stream = byte_stream.getvalue().decode("utf-8")
|
||||
print(text_stream)
|
||||
self.soup = BeautifulSoup(text_stream, "html.parser")
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||
html_content = f.read()
|
||||
self.soup = BeautifulSoup(html_content, "html.parser")
|
||||
except Exception as e:
|
||||
_log.error("could not parse html: {}".format(e))
|
||||
return doc
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return True
|
||||
|
||||
@ -58,15 +72,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def convert(self) -> DoclingDocument:
|
||||
# access self.path_or_stream to load stuff
|
||||
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
||||
|
||||
try:
|
||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||
html_content = f.read()
|
||||
self.soup = BeautifulSoup(html_content, "html.parser")
|
||||
except Exception as e:
|
||||
_log.error("could not parse html: {}".format(e))
|
||||
return doc
|
||||
|
||||
print("Trying to convert HTML...")
|
||||
# Replace <br> tags with newline characters
|
||||
for br in self.soup.body.find_all("br"):
|
||||
br.replace_with("\n")
|
||||
|
@ -78,8 +78,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
# Parses the PPTX into a structured document model.
|
||||
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
|
||||
|
||||
fname = ""
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
fname = self.path_or_stream.name
|
||||
|
||||
origin = DocumentOrigin(
|
||||
filename=self.path_or_stream.name,
|
||||
filename=fname,
|
||||
mimetype="application/vnd.ms-powerpoint",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
|
@ -1,5 +1,6 @@
|
||||
import json
|
||||
import logging
|
||||
from io import BytesIO, TextIOWrapper
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
@ -9,6 +10,7 @@ from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DocumentStream,
|
||||
InputFormat,
|
||||
PdfPipelineOptions,
|
||||
PipelineOptions,
|
||||
@ -29,6 +31,16 @@ input_paths = [
|
||||
Path("tests/data/powerpoint_sample.pptx"),
|
||||
Path("tests/data/2206.01062.pdf"),
|
||||
]
|
||||
|
||||
input_bytes = []
|
||||
for p in input_paths:
|
||||
buf = BytesIO(p.open("rb").read())
|
||||
# tstream = TextIOWrapper(buf, encoding='utf-8')
|
||||
# input_bytes.append(tstream)
|
||||
bstream = DocumentStream(filename=p.name, stream=buf)
|
||||
input_bytes.append(bstream)
|
||||
|
||||
# input = DocumentConversionInput.from_streams(input_bytes)
|
||||
input = DocumentConversionInput.from_paths(input_paths)
|
||||
|
||||
# for defaults use:
|
||||
|
Loading…
Reference in New Issue
Block a user