mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Improved backends
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
1d55cbdca9
commit
07d952acf9
@ -1,5 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
from io import BytesIO
|
from io import BytesIO, TextIOWrapper
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
|
|
||||||
@ -27,6 +27,7 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||||
|
print("About to init HTML backend...")
|
||||||
super().__init__(path_or_stream, document_hash)
|
super().__init__(path_or_stream, document_hash)
|
||||||
self.soup = None
|
self.soup = None
|
||||||
# HTML file:
|
# HTML file:
|
||||||
@ -39,6 +40,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.parents[i] = None
|
self.parents[i] = None
|
||||||
self.labels = {}
|
self.labels = {}
|
||||||
|
|
||||||
|
try:
|
||||||
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
|
text_stream = byte_stream.getvalue().decode("utf-8")
|
||||||
|
print(text_stream)
|
||||||
|
self.soup = BeautifulSoup(text_stream, "html.parser")
|
||||||
|
if isinstance(self.path_or_stream, Path):
|
||||||
|
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||||
|
html_content = f.read()
|
||||||
|
self.soup = BeautifulSoup(html_content, "html.parser")
|
||||||
|
except Exception as e:
|
||||||
|
_log.error("could not parse html: {}".format(e))
|
||||||
|
return doc
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -58,15 +72,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
# access self.path_or_stream to load stuff
|
# access self.path_or_stream to load stuff
|
||||||
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
||||||
|
print("Trying to convert HTML...")
|
||||||
try:
|
|
||||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
|
||||||
html_content = f.read()
|
|
||||||
self.soup = BeautifulSoup(html_content, "html.parser")
|
|
||||||
except Exception as e:
|
|
||||||
_log.error("could not parse html: {}".format(e))
|
|
||||||
return doc
|
|
||||||
|
|
||||||
# Replace <br> tags with newline characters
|
# Replace <br> tags with newline characters
|
||||||
for br in self.soup.body.find_all("br"):
|
for br in self.soup.body.find_all("br"):
|
||||||
br.replace_with("\n")
|
br.replace_with("\n")
|
||||||
|
@ -78,8 +78,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
# Parses the PPTX into a structured document model.
|
# Parses the PPTX into a structured document model.
|
||||||
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
|
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
|
||||||
|
|
||||||
|
fname = ""
|
||||||
|
if isinstance(self.path_or_stream, Path):
|
||||||
|
fname = self.path_or_stream.name
|
||||||
|
|
||||||
origin = DocumentOrigin(
|
origin = DocumentOrigin(
|
||||||
filename=self.path_or_stream.name,
|
filename=fname,
|
||||||
mimetype="application/vnd.ms-powerpoint",
|
mimetype="application/vnd.ms-powerpoint",
|
||||||
binary_hash=self.document_hash,
|
binary_hash=self.document_hash,
|
||||||
)
|
)
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
from io import BytesIO, TextIOWrapper
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
@ -9,6 +10,7 @@ from docling.backend.msword_backend import MsWordDocumentBackend
|
|||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
|
DocumentStream,
|
||||||
InputFormat,
|
InputFormat,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
PipelineOptions,
|
PipelineOptions,
|
||||||
@ -29,6 +31,16 @@ input_paths = [
|
|||||||
Path("tests/data/powerpoint_sample.pptx"),
|
Path("tests/data/powerpoint_sample.pptx"),
|
||||||
Path("tests/data/2206.01062.pdf"),
|
Path("tests/data/2206.01062.pdf"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
input_bytes = []
|
||||||
|
for p in input_paths:
|
||||||
|
buf = BytesIO(p.open("rb").read())
|
||||||
|
# tstream = TextIOWrapper(buf, encoding='utf-8')
|
||||||
|
# input_bytes.append(tstream)
|
||||||
|
bstream = DocumentStream(filename=p.name, stream=buf)
|
||||||
|
input_bytes.append(bstream)
|
||||||
|
|
||||||
|
# input = DocumentConversionInput.from_streams(input_bytes)
|
||||||
input = DocumentConversionInput.from_paths(input_paths)
|
input = DocumentConversionInput.from_paths(input_paths)
|
||||||
|
|
||||||
# for defaults use:
|
# for defaults use:
|
||||||
|
Loading…
Reference in New Issue
Block a user