From 6efcf0a5a5af8dc610daa9f95d9fbfef7472045d Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Fri, 11 Oct 2024 16:47:15 +0200 Subject: [PATCH] Add image format support to PdfBackend Signed-off-by: Christoph Auer --- docling/backend/abstract_backend.py | 6 ++- docling/backend/docling_parse_backend.py | 19 ++++---- docling/backend/html_backend.py | 7 +-- docling/backend/mspowerpoint_backend.py | 7 +-- docling/backend/msword_backend.py | 5 +- docling/backend/pdf_backend.py | 21 +++++++- docling/backend/pypdfium2_backend.py | 9 ++-- docling/datamodel/document.py | 4 +- docling/document_converter.py | 5 ++ examples/run_with_formats.py | 61 ++++++++---------------- tests/test_backend_docling_parse.py | 21 ++++++-- tests/test_backend_pdfium.py | 21 ++++++-- 12 files changed, 110 insertions(+), 76 deletions(-) diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index 7d131178..e109ed0a 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -3,6 +3,7 @@ from io import BytesIO from pathlib import Path from typing import Set, Union +# from docling.datamodel.document import InputDocument from docling_core.types.experimental import DoclingDocument from docling.datamodel.base_models import InputFormat @@ -10,9 +11,10 @@ from docling.datamodel.base_models import InputFormat class AbstractDocumentBackend(ABC): @abstractmethod - def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): self.path_or_stream = path_or_stream - self.document_hash = document_hash + self.document_hash = in_doc.document_hash + self.input_format = in_doc.format @abstractmethod def is_valid(self) -> bool: diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index e3e2293e..789471e8 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -12,6 +12,7 @@ from pypdfium2 import PdfPage from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.datamodel.base_models import Cell +from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) @@ -187,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend): class DoclingParseDocumentBackend(PdfDocumentBackend): - def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): - super().__init__(path_or_stream, document_hash) + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) - self._pdoc = pdfium.PdfDocument(path_or_stream) + self._pdoc = pdfium.PdfDocument(self.path_or_stream) self.parser = pdf_parser() success = False - if isinstance(path_or_stream, BytesIO): + if isinstance(self.path_or_stream, BytesIO): success = self.parser.load_document_from_bytesio( - document_hash, path_or_stream + self.document_hash, self.path_or_stream + ) + elif isinstance(self.path_or_stream, Path): + success = self.parser.load_document( + self.document_hash, str(self.path_or_stream) ) - elif isinstance(path_or_stream, Path): - success = self.parser.load_document(document_hash, str(path_or_stream)) if not success: raise RuntimeError( - f"docling-parse could not load document with hash {document_hash}." + f"docling-parse could not load document with hash {self.document_hash}." ) def page_count(self) -> int: diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 216d156d..b536d2ff 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -15,14 +15,15 @@ from docling_core.types.experimental.labels import DocItemLabel, GroupLabel from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) class HTMLDocumentBackend(DeclarativeDocumentBackend): - def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) _log.debug("About to init HTML backend...") - super().__init__(path_or_stream, document_hash) self.soup = None # HTML file: self.path_or_stream = path_or_stream @@ -44,7 +45,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.soup = BeautifulSoup(html_content, "html.parser") except Exception as e: raise RuntimeError( - f"Could not initialize HTML backend for file with hash {document_hash}." + f"Could not initialize HTML backend for file with hash {self.document_hash}." ) from e def is_valid(self) -> bool: diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index b67c3ca3..2914e1e0 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -23,13 +23,14 @@ from docling.backend.abstract_backend import ( PaginatedDocumentBackend, ) from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend): - def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): - super().__init__(path_or_stream, document_hash) + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) self.namespaces = { "a": "http://schemas.openxmlformats.org/drawingml/2006/main", "c": "http://schemas.openxmlformats.org/drawingml/2006/chart", @@ -45,7 +46,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB self.valid = True except Exception as e: raise RuntimeError( - f"MsPowerpointDocumentBackend could not load document with hash {document_hash}" + f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" ) from e return diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index c3504b33..ed7c065c 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -17,20 +17,21 @@ from lxml import etree from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) class MsWordDocumentBackend(DeclarativeDocumentBackend): - def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) self.XML_KEY = ( "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val" ) self.xml_namespaces = { "w": "http://schemas.microsoft.com/office/word/2003/wordml" } - super().__init__(path_or_stream, document_hash) # self.initialise(path_or_stream) # Word file: self.path_or_stream = path_or_stream diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py index daf04321..5fe5be3d 100644 --- a/docling/backend/pdf_backend.py +++ b/docling/backend/pdf_backend.py @@ -1,11 +1,14 @@ from abc import ABC, abstractmethod -from typing import Iterable, Optional, Set +from io import BytesIO +from typing import Iterable, Optional, Set, Union +from docling_core.types.doc.doc_ocr import Path from docling_core.types.experimental import BoundingBox, Size from PIL import Image from docling.backend.abstract_backend import PaginatedDocumentBackend from docling.datamodel.base_models import Cell, InputFormat +from docling.datamodel.document import InputDocument class PdfPageBackend(ABC): @@ -42,6 +45,22 @@ class PdfPageBackend(ABC): class PdfDocumentBackend(PaginatedDocumentBackend): + + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) + + if self.input_format is not InputFormat.PDF: + if self.input_format is InputFormat.IMAGE: + buf = BytesIO() + img = Image.open(self.path_or_stream) + img.save(buf, "PDF") + buf.seek(0) + self.path_or_stream = buf + else: + raise RuntimeError( + f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend." + ) + @abstractmethod def load_page(self, page_no: int) -> PdfPageBackend: pass diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index e4c6e423..4fdbdbb9 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -232,13 +232,14 @@ class PyPdfiumPageBackend(PdfPageBackend): class PyPdfiumDocumentBackend(PdfDocumentBackend): - def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): - super().__init__(path_or_stream, document_hash) + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) + try: - self._pdoc = pdfium.PdfDocument(path_or_stream) + self._pdoc = pdfium.PdfDocument(self.path_or_stream) except PdfiumError as e: raise RuntimeError( - f"pypdfium could not load document with hash {document_hash}" + f"pypdfium could not load document with hash {self.document_hash}" ) from e def page_count(self) -> int: diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 9240c6b5..3bcbc080 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -149,9 +149,7 @@ class InputDocument(BaseModel): f"Please check your format configuration on DocumentConverter." ) - self._backend = backend( - path_or_stream=path_or_stream, document_hash=self.document_hash - ) + self._backend = backend(self, path_or_stream=path_or_stream) class DocumentFormat(str, Enum): diff --git a/docling/document_converter.py b/docling/document_converter.py index f354d58b..adecaca1 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -61,6 +61,11 @@ class PdfFormatOption(FormatOption): backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend +class ImageFormatOption(FormatOption): + pipeline_cls: Type = StandardPdfModelPipeline + backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend + + _format_to_default_options = { InputFormat.DOCX: FormatOption( pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py index f93db241..4e1b0841 100644 --- a/examples/run_with_formats.py +++ b/examples/run_with_formats.py @@ -21,57 +21,34 @@ input_paths = [ Path("tests/data/word_sample.docx"), Path("tests/data/lorem_ipsum.docx"), Path("tests/data/powerpoint_sample.pptx"), + Path("tests/data/2305.03393v1-pg9-img.png"), Path("tests/data/2206.01062.pdf"), - # Path("tests/data/2305.03393v1-pg9-img.png"), ] ## for defaults use: # doc_converter = DocumentConverter() ## to customize use: -doc_converter = DocumentConverter( # all of the below is optional, has internal defaults. - formats=[ - InputFormat.PDF, - # InputFormat.IMAGE, - InputFormat.DOCX, - InputFormat.HTML, - InputFormat.PPTX, - ], # whitelist formats, other files are ignored. - format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend - ), # PdfFormatOption(backend=PyPdfiumDocumentBackend), - InputFormat.DOCX: WordFormatOption( - pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend - ), - # InputFormat.IMAGE: PdfFormatOption(), - }, +doc_converter = ( + DocumentConverter( # all of the below is optional, has internal defaults. + formats=[ + InputFormat.PDF, + InputFormat.IMAGE, + InputFormat.DOCX, + InputFormat.HTML, + InputFormat.PPTX, + ], # whitelist formats, non-matching files are ignored. + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend + ), + InputFormat.DOCX: WordFormatOption( + pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend + ), + }, + ) ) -doc_converter = DocumentConverter( # all of the below is optional, has internal defaults. - pdf=None, - docx=WordFormatOption( - pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend - ), - formats=[ - InputFormat.PDF, - # InputFormat.IMAGE, - InputFormat.DOCX, - InputFormat.HTML, - InputFormat.PPTX, - ], # whitelist formats, other files are ignored. - format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend - ), # PdfFormatOption(backend=PyPdfiumDocumentBackend), - InputFormat.DOCX: WordFormatOption( - pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend - ), - # InputFormat.IMAGE: PdfFormatOption(), - }, -) - - conv_results = doc_converter.convert_all(input_paths) for res in conv_results: diff --git a/tests/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py index 2b8e2f4d..fac71e70 100644 --- a/tests/test_backend_docling_parse.py +++ b/tests/test_backend_docling_parse.py @@ -7,6 +7,8 @@ from docling.backend.docling_parse_backend import ( DoclingParseDocumentBackend, DoclingParsePageBackend, ) +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument @pytest.fixture @@ -14,10 +16,21 @@ def test_doc_path(): return Path("./tests/data/2206.01062.pdf") +def _get_backend(pdf_doc): + in_doc = InputDocument( + path_or_stream=pdf_doc, + format=InputFormat.PDF, + backend=DoclingParseDocumentBackend, + ) + + doc_backend = in_doc._backend + return doc_backend + + def test_text_cell_counts(): pdf_doc = Path("./tests/data/redp5695.pdf") - doc_backend = DoclingParseDocumentBackend(pdf_doc, "123456xyz") + doc_backend = _get_backend(pdf_doc) for page_index in range(0, doc_backend.page_count()): last_cell_count = None @@ -36,7 +49,7 @@ def test_text_cell_counts(): def test_get_text_from_rect(test_doc_path): - doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz") + doc_backend = _get_backend(test_doc_path) page_backend: DoclingParsePageBackend = doc_backend.load_page(0) # Get the title text of the DocLayNet paper @@ -49,7 +62,7 @@ def test_get_text_from_rect(test_doc_path): def test_crop_page_image(test_doc_path): - doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz") + doc_backend = _get_backend(test_doc_path) page_backend: DoclingParsePageBackend = doc_backend.load_page(0) # Crop out "Figure 1" from the DocLayNet paper @@ -60,5 +73,5 @@ def test_crop_page_image(test_doc_path): def test_num_pages(test_doc_path): - doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz") + doc_backend = _get_backend(test_doc_path) doc_backend.page_count() == 9 diff --git a/tests/test_backend_pdfium.py b/tests/test_backend_pdfium.py index c3050b34..9c0c3dd1 100644 --- a/tests/test_backend_pdfium.py +++ b/tests/test_backend_pdfium.py @@ -7,6 +7,8 @@ from docling.backend.pypdfium2_backend import ( PyPdfiumDocumentBackend, PyPdfiumPageBackend, ) +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument @pytest.fixture @@ -14,10 +16,21 @@ def test_doc_path(): return Path("./tests/data/2206.01062.pdf") +def _get_backend(pdf_doc): + in_doc = InputDocument( + path_or_stream=pdf_doc, + format=InputFormat.PDF, + backend=PyPdfiumDocumentBackend, + ) + + doc_backend = in_doc._backend + return doc_backend + + def test_text_cell_counts(): pdf_doc = Path("./tests/data/redp5695.pdf") - doc_backend = PyPdfiumDocumentBackend(pdf_doc, "123456xyz") + doc_backend = _get_backend(pdf_doc) for page_index in range(0, doc_backend.page_count()): last_cell_count = None @@ -36,7 +49,7 @@ def test_text_cell_counts(): def test_get_text_from_rect(test_doc_path): - doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz") + doc_backend = _get_backend(test_doc_path) page_backend: PyPdfiumPageBackend = doc_backend.load_page(0) # Get the title text of the DocLayNet paper @@ -49,7 +62,7 @@ def test_get_text_from_rect(test_doc_path): def test_crop_page_image(test_doc_path): - doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz") + doc_backend = _get_backend(test_doc_path) page_backend: PyPdfiumPageBackend = doc_backend.load_page(0) # Crop out "Figure 1" from the DocLayNet paper @@ -60,5 +73,5 @@ def test_crop_page_image(test_doc_path): def test_num_pages(test_doc_path): - doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz") + doc_backend = _get_backend(test_doc_path) doc_backend.page_count() == 9