Add image format support to PdfBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-11 16:47:15 +02:00
parent d0fccb9342
commit 6efcf0a5a5
12 changed files with 110 additions and 76 deletions

View File

@ -3,6 +3,7 @@ from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
# from docling.datamodel.document import InputDocument
from docling_core.types.experimental import DoclingDocument from docling_core.types.experimental import DoclingDocument
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
@ -10,9 +11,10 @@ from docling.datamodel.base_models import InputFormat
class AbstractDocumentBackend(ABC): class AbstractDocumentBackend(ABC):
@abstractmethod @abstractmethod
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
self.path_or_stream = path_or_stream self.path_or_stream = path_or_stream
self.document_hash = document_hash self.document_hash = in_doc.document_hash
self.input_format = in_doc.format
@abstractmethod @abstractmethod
def is_valid(self) -> bool: def is_valid(self) -> bool:

View File

@ -12,6 +12,7 @@ from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell from docling.datamodel.base_models import Cell
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -187,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
class DoclingParseDocumentBackend(PdfDocumentBackend): class DoclingParseDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(path_or_stream, document_hash) super().__init__(in_doc, path_or_stream)
self._pdoc = pdfium.PdfDocument(path_or_stream) self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self.parser = pdf_parser() self.parser = pdf_parser()
success = False success = False
if isinstance(path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
success = self.parser.load_document_from_bytesio( success = self.parser.load_document_from_bytesio(
document_hash, path_or_stream self.document_hash, self.path_or_stream
)
elif isinstance(self.path_or_stream, Path):
success = self.parser.load_document(
self.document_hash, str(self.path_or_stream)
) )
elif isinstance(path_or_stream, Path):
success = self.parser.load_document(document_hash, str(path_or_stream))
if not success: if not success:
raise RuntimeError( raise RuntimeError(
f"docling-parse could not load document with hash {document_hash}." f"docling-parse could not load document with hash {self.document_hash}."
) )
def page_count(self) -> int: def page_count(self) -> int:

View File

@ -15,14 +15,15 @@ from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend): class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
_log.debug("About to init HTML backend...") _log.debug("About to init HTML backend...")
super().__init__(path_or_stream, document_hash)
self.soup = None self.soup = None
# HTML file: # HTML file:
self.path_or_stream = path_or_stream self.path_or_stream = path_or_stream
@ -44,7 +45,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.soup = BeautifulSoup(html_content, "html.parser") self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e: except Exception as e:
raise RuntimeError( raise RuntimeError(
f"Could not initialize HTML backend for file with hash {document_hash}." f"Could not initialize HTML backend for file with hash {self.document_hash}."
) from e ) from e
def is_valid(self) -> bool: def is_valid(self) -> bool:

View File

@ -23,13 +23,14 @@ from docling.backend.abstract_backend import (
PaginatedDocumentBackend, PaginatedDocumentBackend,
) )
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend): class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(path_or_stream, document_hash) super().__init__(in_doc, path_or_stream)
self.namespaces = { self.namespaces = {
"a": "http://schemas.openxmlformats.org/drawingml/2006/main", "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
"c": "http://schemas.openxmlformats.org/drawingml/2006/chart", "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
@ -45,7 +46,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
self.valid = True self.valid = True
except Exception as e: except Exception as e:
raise RuntimeError( raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}" f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
) from e ) from e
return return

View File

@ -17,20 +17,21 @@ from lxml import etree
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class MsWordDocumentBackend(DeclarativeDocumentBackend): class MsWordDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self.XML_KEY = ( self.XML_KEY = (
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val" "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
) )
self.xml_namespaces = { self.xml_namespaces = {
"w": "http://schemas.microsoft.com/office/word/2003/wordml" "w": "http://schemas.microsoft.com/office/word/2003/wordml"
} }
super().__init__(path_or_stream, document_hash)
# self.initialise(path_or_stream) # self.initialise(path_or_stream)
# Word file: # Word file:
self.path_or_stream = path_or_stream self.path_or_stream = path_or_stream

View File

@ -1,11 +1,14 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Iterable, Optional, Set from io import BytesIO
from typing import Iterable, Optional, Set, Union
from docling_core.types.doc.doc_ocr import Path
from docling_core.types.experimental import BoundingBox, Size from docling_core.types.experimental import BoundingBox, Size
from PIL import Image from PIL import Image
from docling.backend.abstract_backend import PaginatedDocumentBackend from docling.backend.abstract_backend import PaginatedDocumentBackend
from docling.datamodel.base_models import Cell, InputFormat from docling.datamodel.base_models import Cell, InputFormat
from docling.datamodel.document import InputDocument
class PdfPageBackend(ABC): class PdfPageBackend(ABC):
@ -42,6 +45,22 @@ class PdfPageBackend(ABC):
class PdfDocumentBackend(PaginatedDocumentBackend): class PdfDocumentBackend(PaginatedDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
if self.input_format is not InputFormat.PDF:
if self.input_format is InputFormat.IMAGE:
buf = BytesIO()
img = Image.open(self.path_or_stream)
img.save(buf, "PDF")
buf.seek(0)
self.path_or_stream = buf
else:
raise RuntimeError(
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
)
@abstractmethod @abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend: def load_page(self, page_no: int) -> PdfPageBackend:
pass pass

View File

@ -232,13 +232,14 @@ class PyPdfiumPageBackend(PdfPageBackend):
class PyPdfiumDocumentBackend(PdfDocumentBackend): class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(path_or_stream, document_hash) super().__init__(in_doc, path_or_stream)
try: try:
self._pdoc = pdfium.PdfDocument(path_or_stream) self._pdoc = pdfium.PdfDocument(self.path_or_stream)
except PdfiumError as e: except PdfiumError as e:
raise RuntimeError( raise RuntimeError(
f"pypdfium could not load document with hash {document_hash}" f"pypdfium could not load document with hash {self.document_hash}"
) from e ) from e
def page_count(self) -> int: def page_count(self) -> int:

View File

@ -149,9 +149,7 @@ class InputDocument(BaseModel):
f"Please check your format configuration on DocumentConverter." f"Please check your format configuration on DocumentConverter."
) )
self._backend = backend( self._backend = backend(self, path_or_stream=path_or_stream)
path_or_stream=path_or_stream, document_hash=self.document_hash
)
class DocumentFormat(str, Enum): class DocumentFormat(str, Enum):

View File

@ -61,6 +61,11 @@ class PdfFormatOption(FormatOption):
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfModelPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
_format_to_default_options = { _format_to_default_options = {
InputFormat.DOCX: FormatOption( InputFormat.DOCX: FormatOption(
pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend

View File

@ -21,57 +21,34 @@ input_paths = [
Path("tests/data/word_sample.docx"), Path("tests/data/word_sample.docx"),
Path("tests/data/lorem_ipsum.docx"), Path("tests/data/lorem_ipsum.docx"),
Path("tests/data/powerpoint_sample.pptx"), Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"), Path("tests/data/2206.01062.pdf"),
# Path("tests/data/2305.03393v1-pg9-img.png"),
] ]
## for defaults use: ## for defaults use:
# doc_converter = DocumentConverter() # doc_converter = DocumentConverter()
## to customize use: ## to customize use:
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults. doc_converter = (
formats=[ DocumentConverter( # all of the below is optional, has internal defaults.
InputFormat.PDF, formats=[
# InputFormat.IMAGE, InputFormat.PDF,
InputFormat.DOCX, InputFormat.IMAGE,
InputFormat.HTML, InputFormat.DOCX,
InputFormat.PPTX, InputFormat.HTML,
], # whitelist formats, other files are ignored. InputFormat.PPTX,
format_options={ ], # whitelist formats, non-matching files are ignored.
InputFormat.PDF: PdfFormatOption( format_options={
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend InputFormat.PDF: PdfFormatOption(
), # PdfFormatOption(backend=PyPdfiumDocumentBackend), pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
InputFormat.DOCX: WordFormatOption( ),
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend InputFormat.DOCX: WordFormatOption(
), pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
# InputFormat.IMAGE: PdfFormatOption(), ),
}, },
)
) )
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
pdf=None,
docx=WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
),
formats=[
InputFormat.PDF,
# InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.PPTX,
], # whitelist formats, other files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
),
# InputFormat.IMAGE: PdfFormatOption(),
},
)
conv_results = doc_converter.convert_all(input_paths) conv_results = doc_converter.convert_all(input_paths)
for res in conv_results: for res in conv_results:

View File

@ -7,6 +7,8 @@ from docling.backend.docling_parse_backend import (
DoclingParseDocumentBackend, DoclingParseDocumentBackend,
DoclingParsePageBackend, DoclingParsePageBackend,
) )
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
@pytest.fixture @pytest.fixture
@ -14,10 +16,21 @@ def test_doc_path():
return Path("./tests/data/2206.01062.pdf") return Path("./tests/data/2206.01062.pdf")
def _get_backend(pdf_doc):
in_doc = InputDocument(
path_or_stream=pdf_doc,
format=InputFormat.PDF,
backend=DoclingParseDocumentBackend,
)
doc_backend = in_doc._backend
return doc_backend
def test_text_cell_counts(): def test_text_cell_counts():
pdf_doc = Path("./tests/data/redp5695.pdf") pdf_doc = Path("./tests/data/redp5695.pdf")
doc_backend = DoclingParseDocumentBackend(pdf_doc, "123456xyz") doc_backend = _get_backend(pdf_doc)
for page_index in range(0, doc_backend.page_count()): for page_index in range(0, doc_backend.page_count()):
last_cell_count = None last_cell_count = None
@ -36,7 +49,7 @@ def test_text_cell_counts():
def test_get_text_from_rect(test_doc_path): def test_get_text_from_rect(test_doc_path):
doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz") doc_backend = _get_backend(test_doc_path)
page_backend: DoclingParsePageBackend = doc_backend.load_page(0) page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
# Get the title text of the DocLayNet paper # Get the title text of the DocLayNet paper
@ -49,7 +62,7 @@ def test_get_text_from_rect(test_doc_path):
def test_crop_page_image(test_doc_path): def test_crop_page_image(test_doc_path):
doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz") doc_backend = _get_backend(test_doc_path)
page_backend: DoclingParsePageBackend = doc_backend.load_page(0) page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper # Crop out "Figure 1" from the DocLayNet paper
@ -60,5 +73,5 @@ def test_crop_page_image(test_doc_path):
def test_num_pages(test_doc_path): def test_num_pages(test_doc_path):
doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz") doc_backend = _get_backend(test_doc_path)
doc_backend.page_count() == 9 doc_backend.page_count() == 9

View File

@ -7,6 +7,8 @@ from docling.backend.pypdfium2_backend import (
PyPdfiumDocumentBackend, PyPdfiumDocumentBackend,
PyPdfiumPageBackend, PyPdfiumPageBackend,
) )
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
@pytest.fixture @pytest.fixture
@ -14,10 +16,21 @@ def test_doc_path():
return Path("./tests/data/2206.01062.pdf") return Path("./tests/data/2206.01062.pdf")
def _get_backend(pdf_doc):
in_doc = InputDocument(
path_or_stream=pdf_doc,
format=InputFormat.PDF,
backend=PyPdfiumDocumentBackend,
)
doc_backend = in_doc._backend
return doc_backend
def test_text_cell_counts(): def test_text_cell_counts():
pdf_doc = Path("./tests/data/redp5695.pdf") pdf_doc = Path("./tests/data/redp5695.pdf")
doc_backend = PyPdfiumDocumentBackend(pdf_doc, "123456xyz") doc_backend = _get_backend(pdf_doc)
for page_index in range(0, doc_backend.page_count()): for page_index in range(0, doc_backend.page_count()):
last_cell_count = None last_cell_count = None
@ -36,7 +49,7 @@ def test_text_cell_counts():
def test_get_text_from_rect(test_doc_path): def test_get_text_from_rect(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz") doc_backend = _get_backend(test_doc_path)
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0) page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
# Get the title text of the DocLayNet paper # Get the title text of the DocLayNet paper
@ -49,7 +62,7 @@ def test_get_text_from_rect(test_doc_path):
def test_crop_page_image(test_doc_path): def test_crop_page_image(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz") doc_backend = _get_backend(test_doc_path)
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0) page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper # Crop out "Figure 1" from the DocLayNet paper
@ -60,5 +73,5 @@ def test_crop_page_image(test_doc_path):
def test_num_pages(test_doc_path): def test_num_pages(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz") doc_backend = _get_backend(test_doc_path)
doc_backend.page_count() == 9 doc_backend.page_count() == 9