mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Add image format support to PdfBackend
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
d0fccb9342
commit
6efcf0a5a5
@ -3,6 +3,7 @@ from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
# from docling.datamodel.document import InputDocument
|
||||
from docling_core.types.experimental import DoclingDocument
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
@ -10,9 +11,10 @@ from docling.datamodel.base_models import InputFormat
|
||||
|
||||
class AbstractDocumentBackend(ABC):
|
||||
@abstractmethod
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
self.path_or_stream = path_or_stream
|
||||
self.document_hash = document_hash
|
||||
self.document_hash = in_doc.document_hash
|
||||
self.input_format = in_doc.format
|
||||
|
||||
@abstractmethod
|
||||
def is_valid(self) -> bool:
|
||||
|
@ -12,6 +12,7 @@ from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Cell
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@ -187,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
|
||||
|
||||
class DoclingParseDocumentBackend(PdfDocumentBackend):
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
super().__init__(path_or_stream, document_hash)
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
||||
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
||||
self.parser = pdf_parser()
|
||||
|
||||
success = False
|
||||
if isinstance(path_or_stream, BytesIO):
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
success = self.parser.load_document_from_bytesio(
|
||||
document_hash, path_or_stream
|
||||
self.document_hash, self.path_or_stream
|
||||
)
|
||||
elif isinstance(self.path_or_stream, Path):
|
||||
success = self.parser.load_document(
|
||||
self.document_hash, str(self.path_or_stream)
|
||||
)
|
||||
elif isinstance(path_or_stream, Path):
|
||||
success = self.parser.load_document(document_hash, str(path_or_stream))
|
||||
|
||||
if not success:
|
||||
raise RuntimeError(
|
||||
f"docling-parse could not load document with hash {document_hash}."
|
||||
f"docling-parse could not load document with hash {self.document_hash}."
|
||||
)
|
||||
|
||||
def page_count(self) -> int:
|
||||
|
@ -15,14 +15,15 @@ from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
_log.debug("About to init HTML backend...")
|
||||
super().__init__(path_or_stream, document_hash)
|
||||
self.soup = None
|
||||
# HTML file:
|
||||
self.path_or_stream = path_or_stream
|
||||
@ -44,7 +45,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.soup = BeautifulSoup(html_content, "html.parser")
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"Could not initialize HTML backend for file with hash {document_hash}."
|
||||
f"Could not initialize HTML backend for file with hash {self.document_hash}."
|
||||
) from e
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
|
@ -23,13 +23,14 @@ from docling.backend.abstract_backend import (
|
||||
PaginatedDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
super().__init__(path_or_stream, document_hash)
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
self.namespaces = {
|
||||
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
||||
"c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
|
||||
@ -45,7 +46,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
self.valid = True
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
|
||||
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
|
||||
return
|
||||
|
@ -17,20 +17,21 @@ from lxml import etree
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
self.XML_KEY = (
|
||||
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
||||
)
|
||||
self.xml_namespaces = {
|
||||
"w": "http://schemas.microsoft.com/office/word/2003/wordml"
|
||||
}
|
||||
super().__init__(path_or_stream, document_hash)
|
||||
# self.initialise(path_or_stream)
|
||||
# Word file:
|
||||
self.path_or_stream = path_or_stream
|
||||
|
@ -1,11 +1,14 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Iterable, Optional, Set
|
||||
from io import BytesIO
|
||||
from typing import Iterable, Optional, Set, Union
|
||||
|
||||
from docling_core.types.doc.doc_ocr import Path
|
||||
from docling_core.types.experimental import BoundingBox, Size
|
||||
from PIL import Image
|
||||
|
||||
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
||||
from docling.datamodel.base_models import Cell, InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
class PdfPageBackend(ABC):
|
||||
@ -42,6 +45,22 @@ class PdfPageBackend(ABC):
|
||||
|
||||
|
||||
class PdfDocumentBackend(PaginatedDocumentBackend):
|
||||
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Initialise the backend, turning image inputs into an in-memory PDF.

    After the base initialiser has run, ``self.input_format`` decides what
    happens (NOTE(review): it is presumably populated by the base class from
    ``in_doc.format`` -- confirm against ``PaginatedDocumentBackend``):

    * ``InputFormat.PDF``: nothing else is done.
    * ``InputFormat.IMAGE``: the image is rendered into a PDF held in a
      ``BytesIO`` and ``self.path_or_stream`` is replaced by that buffer.
    * anything else: ``RuntimeError`` is raised.

    Args:
        in_doc: The input document this backend is created for.
        path_or_stream: Location or stream of the raw input.

    Raises:
        RuntimeError: If the input format is neither PDF nor IMAGE.
    """
    super().__init__(in_doc, path_or_stream)

    if self.input_format is InputFormat.PDF:
        return

    if self.input_format is not InputFormat.IMAGE:
        raise RuntimeError(
            f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
        )

    buf = BytesIO()
    # The context manager releases the file handle Pillow opens for a path
    # input; previously the image was never closed.
    with Image.open(self.path_or_stream) as img:
        # Pillow's PDF writer only accepts a limited set of modes (alpha
        # modes depend on the Pillow version -- see its PDF format notes),
        # so anything outside the long-standing set is converted to RGB
        # instead of letting save() raise.
        if img.mode in ("1", "L", "P", "RGB", "CMYK"):
            pdf_ready = img
        else:
            pdf_ready = img.convert("RGB")
        pdf_ready.save(buf, "PDF")

    # Rewind so downstream readers start at the beginning of the PDF.
    buf.seek(0)
    self.path_or_stream = buf
|
||||
|
||||
@abstractmethod
|
||||
def load_page(self, page_no: int) -> PdfPageBackend:
|
||||
pass
|
||||
|
@ -232,13 +232,14 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
|
||||
|
||||
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
super().__init__(path_or_stream, document_hash)
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
try:
|
||||
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
||||
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
||||
except PdfiumError as e:
|
||||
raise RuntimeError(
|
||||
f"pypdfium could not load document with hash {document_hash}"
|
||||
f"pypdfium could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
|
||||
def page_count(self) -> int:
|
||||
|
@ -149,9 +149,7 @@ class InputDocument(BaseModel):
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
|
||||
self._backend = backend(
|
||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||
)
|
||||
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||
|
||||
|
||||
class DocumentFormat(str, Enum):
|
||||
|
@ -61,6 +61,11 @@ class PdfFormatOption(FormatOption):
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||
|
||||
|
||||
class ImageFormatOption(FormatOption):
    """Format option used for image inputs.

    Defaults to the ``StandardPdfModelPipeline`` pipeline with
    ``DoclingParseDocumentBackend`` as the backend.
    """

    pipeline_cls: Type = StandardPdfModelPipeline
    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||
|
||||
|
||||
_format_to_default_options = {
|
||||
InputFormat.DOCX: FormatOption(
|
||||
pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
|
||||
|
@ -21,57 +21,34 @@ input_paths = [
|
||||
Path("tests/data/word_sample.docx"),
|
||||
Path("tests/data/lorem_ipsum.docx"),
|
||||
Path("tests/data/powerpoint_sample.pptx"),
|
||||
Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
Path("tests/data/2206.01062.pdf"),
|
||||
# Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
]
|
||||
|
||||
## for defaults use:
|
||||
# doc_converter = DocumentConverter()
|
||||
|
||||
## to customize use:
|
||||
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||
doc_converter = (
|
||||
DocumentConverter( # all of the below is optional, has internal defaults.
|
||||
formats=[
|
||||
InputFormat.PDF,
|
||||
# InputFormat.IMAGE,
|
||||
InputFormat.IMAGE,
|
||||
InputFormat.DOCX,
|
||||
InputFormat.HTML,
|
||||
InputFormat.PPTX,
|
||||
], # whitelist formats, other files are ignored.
|
||||
], # whitelist formats, non-matching files are ignored.
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
|
||||
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
||||
),
|
||||
InputFormat.DOCX: WordFormatOption(
|
||||
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
||||
),
|
||||
# InputFormat.IMAGE: PdfFormatOption(),
|
||||
},
|
||||
)
|
||||
|
||||
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||
pdf=None,
|
||||
docx=WordFormatOption(
|
||||
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
||||
),
|
||||
formats=[
|
||||
InputFormat.PDF,
|
||||
# InputFormat.IMAGE,
|
||||
InputFormat.DOCX,
|
||||
InputFormat.HTML,
|
||||
InputFormat.PPTX,
|
||||
], # whitelist formats, other files are ignored.
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
|
||||
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
||||
InputFormat.DOCX: WordFormatOption(
|
||||
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
||||
),
|
||||
# InputFormat.IMAGE: PdfFormatOption(),
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
conv_results = doc_converter.convert_all(input_paths)
|
||||
|
||||
for res in conv_results:
|
||||
|
@ -7,6 +7,8 @@ from docling.backend.docling_parse_backend import (
|
||||
DoclingParseDocumentBackend,
|
||||
DoclingParsePageBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@ -14,10 +16,21 @@ def test_doc_path():
|
||||
return Path("./tests/data/2206.01062.pdf")
|
||||
|
||||
|
||||
def _get_backend(pdf_doc):
    """Return the docling-parse backend created for *pdf_doc*.

    The backend is not constructed directly: an ``InputDocument`` is built
    with ``InputFormat.PDF`` and ``DoclingParseDocumentBackend``, and the
    backend instance it holds in ``_backend`` is handed back.
    """
    return InputDocument(
        path_or_stream=pdf_doc,
        format=InputFormat.PDF,
        backend=DoclingParseDocumentBackend,
    )._backend
|
||||
|
||||
|
||||
def test_text_cell_counts():
|
||||
pdf_doc = Path("./tests/data/redp5695.pdf")
|
||||
|
||||
doc_backend = DoclingParseDocumentBackend(pdf_doc, "123456xyz")
|
||||
doc_backend = _get_backend(pdf_doc)
|
||||
|
||||
for page_index in range(0, doc_backend.page_count()):
|
||||
last_cell_count = None
|
||||
@ -36,7 +49,7 @@ def test_text_cell_counts():
|
||||
|
||||
|
||||
def test_get_text_from_rect(test_doc_path):
|
||||
doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz")
|
||||
doc_backend = _get_backend(test_doc_path)
|
||||
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
||||
|
||||
# Get the title text of the DocLayNet paper
|
||||
@ -49,7 +62,7 @@ def test_get_text_from_rect(test_doc_path):
|
||||
|
||||
|
||||
def test_crop_page_image(test_doc_path):
|
||||
doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz")
|
||||
doc_backend = _get_backend(test_doc_path)
|
||||
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
||||
|
||||
# Crop out "Figure 1" from the DocLayNet paper
|
||||
@ -60,5 +73,5 @@ def test_crop_page_image(test_doc_path):
|
||||
|
||||
|
||||
def test_num_pages(test_doc_path):
    """Check that the sample document reports the expected number of pages."""
    doc_backend = _get_backend(test_doc_path)
    # The comparison has to be asserted: as a bare expression its result was
    # discarded, so this test could never fail.
    assert doc_backend.page_count() == 9
|
||||
|
@ -7,6 +7,8 @@ from docling.backend.pypdfium2_backend import (
|
||||
PyPdfiumDocumentBackend,
|
||||
PyPdfiumPageBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@ -14,10 +16,21 @@ def test_doc_path():
|
||||
return Path("./tests/data/2206.01062.pdf")
|
||||
|
||||
|
||||
def _get_backend(pdf_doc):
    """Return the pypdfium backend created for *pdf_doc*.

    The backend is not constructed directly: an ``InputDocument`` is built
    with ``InputFormat.PDF`` and ``PyPdfiumDocumentBackend``, and the backend
    instance it holds in ``_backend`` is handed back.
    """
    return InputDocument(
        path_or_stream=pdf_doc,
        format=InputFormat.PDF,
        backend=PyPdfiumDocumentBackend,
    )._backend
|
||||
|
||||
|
||||
def test_text_cell_counts():
|
||||
pdf_doc = Path("./tests/data/redp5695.pdf")
|
||||
|
||||
doc_backend = PyPdfiumDocumentBackend(pdf_doc, "123456xyz")
|
||||
doc_backend = _get_backend(pdf_doc)
|
||||
|
||||
for page_index in range(0, doc_backend.page_count()):
|
||||
last_cell_count = None
|
||||
@ -36,7 +49,7 @@ def test_text_cell_counts():
|
||||
|
||||
|
||||
def test_get_text_from_rect(test_doc_path):
|
||||
doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz")
|
||||
doc_backend = _get_backend(test_doc_path)
|
||||
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
|
||||
|
||||
# Get the title text of the DocLayNet paper
|
||||
@ -49,7 +62,7 @@ def test_get_text_from_rect(test_doc_path):
|
||||
|
||||
|
||||
def test_crop_page_image(test_doc_path):
|
||||
doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz")
|
||||
doc_backend = _get_backend(test_doc_path)
|
||||
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
|
||||
|
||||
# Crop out "Figure 1" from the DocLayNet paper
|
||||
@ -60,5 +73,5 @@ def test_crop_page_image(test_doc_path):
|
||||
|
||||
|
||||
def test_num_pages(test_doc_path):
    """Check that the sample document reports the expected number of pages."""
    doc_backend = _get_backend(test_doc_path)
    # The comparison has to be asserted: as a bare expression its result was
    # discarded, so this test could never fail.
    assert doc_backend.page_count() == 9
|
||||
|
Loading…
Reference in New Issue
Block a user