Add image format support to PdfBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-11 16:47:15 +02:00
parent d0fccb9342
commit 6efcf0a5a5
12 changed files with 110 additions and 76 deletions

View File

@ -3,6 +3,7 @@ from io import BytesIO
from pathlib import Path
from typing import Set, Union
# from docling.datamodel.document import InputDocument
from docling_core.types.experimental import DoclingDocument
from docling.datamodel.base_models import InputFormat
@ -10,9 +11,10 @@ from docling.datamodel.base_models import InputFormat
class AbstractDocumentBackend(ABC):
@abstractmethod
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
self.path_or_stream = path_or_stream
self.document_hash = document_hash
self.document_hash = in_doc.document_hash
self.input_format = in_doc.format
@abstractmethod
def is_valid(self) -> bool:

View File

@ -12,6 +12,7 @@ from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
@ -187,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
class DoclingParseDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self._pdoc = pdfium.PdfDocument(path_or_stream)
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self.parser = pdf_parser()
success = False
if isinstance(path_or_stream, BytesIO):
if isinstance(self.path_or_stream, BytesIO):
success = self.parser.load_document_from_bytesio(
document_hash, path_or_stream
self.document_hash, self.path_or_stream
)
elif isinstance(self.path_or_stream, Path):
success = self.parser.load_document(
self.document_hash, str(self.path_or_stream)
)
elif isinstance(path_or_stream, Path):
success = self.parser.load_document(document_hash, str(path_or_stream))
if not success:
raise RuntimeError(
f"docling-parse could not load document with hash {document_hash}."
f"docling-parse could not load document with hash {self.document_hash}."
)
def page_count(self) -> int:

View File

@ -15,14 +15,15 @@ from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
_log.debug("About to init HTML backend...")
super().__init__(path_or_stream, document_hash)
self.soup = None
# HTML file:
self.path_or_stream = path_or_stream
@ -44,7 +45,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
raise RuntimeError(
f"Could not initialize HTML backend for file with hash {document_hash}."
f"Could not initialize HTML backend for file with hash {self.document_hash}."
) from e
def is_valid(self) -> bool:

View File

@ -23,13 +23,14 @@ from docling.backend.abstract_backend import (
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self.namespaces = {
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
"c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
@ -45,7 +46,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
self.valid = True
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
) from e
return

View File

@ -17,20 +17,21 @@ from lxml import etree
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class MsWordDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self.XML_KEY = (
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
)
self.xml_namespaces = {
"w": "http://schemas.microsoft.com/office/word/2003/wordml"
}
super().__init__(path_or_stream, document_hash)
# self.initialise(path_or_stream)
# Word file:
self.path_or_stream = path_or_stream

View File

@ -1,11 +1,14 @@
from abc import ABC, abstractmethod
from typing import Iterable, Optional, Set
from io import BytesIO
from typing import Iterable, Optional, Set, Union
from docling_core.types.doc.doc_ocr import Path
from docling_core.types.experimental import BoundingBox, Size
from PIL import Image
from docling.backend.abstract_backend import PaginatedDocumentBackend
from docling.datamodel.base_models import Cell, InputFormat
from docling.datamodel.document import InputDocument
class PdfPageBackend(ABC):
@ -42,6 +45,22 @@ class PdfPageBackend(ABC):
class PdfDocumentBackend(PaginatedDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Initialise the backend, converting image inputs to an in-memory PDF.

    Args:
        in_doc: The input document this backend belongs to; passed on to
            the parent initialiser.
        path_or_stream: Path or byte stream holding the document data.

    Raises:
        RuntimeError: If the input format is neither PDF nor IMAGE.
    """
    # NOTE(review): self.input_format and self.path_or_stream are presumably
    # set by the parent initialiser (AbstractDocumentBackend) — confirm.
    super().__init__(in_doc, path_or_stream)

    # PDFs are consumed as-is; nothing to convert.
    if self.input_format is InputFormat.PDF:
        return

    if self.input_format is not InputFormat.IMAGE:
        raise RuntimeError(
            f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
        )

    # Render the image into a PDF held in memory so that subclasses can load
    # it exactly like a regular PDF stream.
    buf = BytesIO()
    # The context manager releases the file handle that PIL opens itself when
    # given a path, instead of leaving it to garbage collection.
    with Image.open(self.path_or_stream) as img:
        img.save(buf, "PDF")
    buf.seek(0)  # rewind so that readers start at the beginning of the PDF
    self.path_or_stream = buf
@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
pass

View File

@ -232,13 +232,14 @@ class PyPdfiumPageBackend(PdfPageBackend):
class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
try:
self._pdoc = pdfium.PdfDocument(path_or_stream)
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
except PdfiumError as e:
raise RuntimeError(
f"pypdfium could not load document with hash {document_hash}"
f"pypdfium could not load document with hash {self.document_hash}"
) from e
def page_count(self) -> int:

View File

@ -149,9 +149,7 @@ class InputDocument(BaseModel):
f"Please check your format configuration on DocumentConverter."
)
self._backend = backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
self._backend = backend(self, path_or_stream=path_or_stream)
class DocumentFormat(str, Enum):

View File

@ -61,6 +61,11 @@ class PdfFormatOption(FormatOption):
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
# Format option for image inputs: defaults to the StandardPdfModelPipeline
# pipeline class and the DoclingParseDocumentBackend backend.
# NOTE(review): presumably this relies on PdfDocumentBackend converting image
# inputs into a PDF before parsing — confirm.
class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfModelPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
_format_to_default_options = {
InputFormat.DOCX: FormatOption(
pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend

View File

@ -21,57 +21,34 @@ input_paths = [
Path("tests/data/word_sample.docx"),
Path("tests/data/lorem_ipsum.docx"),
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
# Path("tests/data/2305.03393v1-pg9-img.png"),
]
## for defaults use:
# doc_converter = DocumentConverter()
## to customize use:
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
doc_converter = (
DocumentConverter( # all of the below is optional, has internal defaults.
formats=[
InputFormat.PDF,
# InputFormat.IMAGE,
InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
], # whitelist formats, other files are ignored.
], # whitelist formats, non-matching files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
),
# InputFormat.IMAGE: PdfFormatOption(),
},
)
)
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
pdf=None,
docx=WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
),
formats=[
InputFormat.PDF,
# InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
], # whitelist formats, other files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
),
# InputFormat.IMAGE: PdfFormatOption(),
},
)
conv_results = doc_converter.convert_all(input_paths)
for res in conv_results:

View File

@ -7,6 +7,8 @@ from docling.backend.docling_parse_backend import (
DoclingParseDocumentBackend,
DoclingParsePageBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
@pytest.fixture
@ -14,10 +16,21 @@ def test_doc_path():
return Path("./tests/data/2206.01062.pdf")
def _get_backend(pdf_doc):
    """Wrap *pdf_doc* in an ``InputDocument`` and return the backend it created.

    The document is declared as PDF and bound to ``DoclingParseDocumentBackend``.
    """
    return InputDocument(
        path_or_stream=pdf_doc,
        format=InputFormat.PDF,
        backend=DoclingParseDocumentBackend,
    )._backend
def test_text_cell_counts():
pdf_doc = Path("./tests/data/redp5695.pdf")
doc_backend = DoclingParseDocumentBackend(pdf_doc, "123456xyz")
doc_backend = _get_backend(pdf_doc)
for page_index in range(0, doc_backend.page_count()):
last_cell_count = None
@ -36,7 +49,7 @@ def test_text_cell_counts():
def test_get_text_from_rect(test_doc_path):
doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz")
doc_backend = _get_backend(test_doc_path)
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
# Get the title text of the DocLayNet paper
@ -49,7 +62,7 @@ def test_get_text_from_rect(test_doc_path):
def test_crop_page_image(test_doc_path):
doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz")
doc_backend = _get_backend(test_doc_path)
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper
@ -60,5 +73,5 @@ def test_crop_page_image(test_doc_path):
def test_num_pages(test_doc_path):
    """Check that the backend reports the expected page count for the sample PDF."""
    doc_backend = _get_backend(test_doc_path)
    # The comparison has to be asserted: as a bare expression its result is
    # discarded and the test could never fail.
    assert doc_backend.page_count() == 9

View File

@ -7,6 +7,8 @@ from docling.backend.pypdfium2_backend import (
PyPdfiumDocumentBackend,
PyPdfiumPageBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
@pytest.fixture
@ -14,10 +16,21 @@ def test_doc_path():
return Path("./tests/data/2206.01062.pdf")
def _get_backend(pdf_doc):
    """Wrap *pdf_doc* in an ``InputDocument`` and return the backend it created.

    The document is declared as PDF and bound to ``PyPdfiumDocumentBackend``.
    """
    return InputDocument(
        path_or_stream=pdf_doc,
        format=InputFormat.PDF,
        backend=PyPdfiumDocumentBackend,
    )._backend
def test_text_cell_counts():
pdf_doc = Path("./tests/data/redp5695.pdf")
doc_backend = PyPdfiumDocumentBackend(pdf_doc, "123456xyz")
doc_backend = _get_backend(pdf_doc)
for page_index in range(0, doc_backend.page_count()):
last_cell_count = None
@ -36,7 +49,7 @@ def test_text_cell_counts():
def test_get_text_from_rect(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz")
doc_backend = _get_backend(test_doc_path)
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
# Get the title text of the DocLayNet paper
@ -49,7 +62,7 @@ def test_get_text_from_rect(test_doc_path):
def test_crop_page_image(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz")
doc_backend = _get_backend(test_doc_path)
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper
@ -60,5 +73,5 @@ def test_crop_page_image(test_doc_path):
def test_num_pages(test_doc_path):
    """Check that the backend reports the expected page count for the sample PDF."""
    doc_backend = _get_backend(test_doc_path)
    # The comparison has to be asserted: as a bare expression its result is
    # discarded and the test could never fail.
    assert doc_backend.page_count() == 9