From 6efcf0a5a5af8dc610daa9f95d9fbfef7472045d Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Fri, 11 Oct 2024 16:47:15 +0200
Subject: [PATCH] Add image format support to PdfBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling/backend/abstract_backend.py      |  6 ++-
 docling/backend/docling_parse_backend.py | 19 ++++----
 docling/backend/html_backend.py          |  7 +--
 docling/backend/mspowerpoint_backend.py  |  7 +--
 docling/backend/msword_backend.py        |  5 +-
 docling/backend/pdf_backend.py           | 21 +++++++-
 docling/backend/pypdfium2_backend.py     |  9 ++--
 docling/datamodel/document.py            |  4 +-
 docling/document_converter.py            |  5 ++
 examples/run_with_formats.py             | 61 ++++++++----------------
 tests/test_backend_docling_parse.py      | 21 ++++++--
 tests/test_backend_pdfium.py             | 21 ++++++--
 12 files changed, 110 insertions(+), 76 deletions(-)

diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py
index 7d131178..e109ed0a 100644
--- a/docling/backend/abstract_backend.py
+++ b/docling/backend/abstract_backend.py
@@ -3,6 +3,7 @@ from io import BytesIO
 from pathlib import Path
 from typing import Set, Union
 
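+# InputDocument is referenced only as a string annotation below, so it is not imported here (presumably to avoid a circular import with docling.datamodel.document - TODO confirm).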
 from docling_core.types.experimental import DoclingDocument
 
 from docling.datamodel.base_models import InputFormat
@@ -10,9 +11,10 @@ from docling.datamodel.base_models import InputFormat
 
 class AbstractDocumentBackend(ABC):
     @abstractmethod
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         self.path_or_stream = path_or_stream
-        self.document_hash = document_hash
+        self.document_hash = in_doc.document_hash  # the hash is now read from the input document
+        self.input_format = in_doc.format  # remembered so subclasses can branch on the input format
 
     @abstractmethod
     def is_valid(self) -> bool:
diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
index e3e2293e..789471e8 100644
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -12,6 +12,7 @@ from pypdfium2 import PdfPage
 
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import Cell
+from docling.datamodel.document import InputDocument
 
 _log = logging.getLogger(__name__)
 
@@ -187,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
 
 
 class DoclingParseDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        super().__init__(path_or_stream, document_hash)
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)  # for IMAGE input, PdfDocumentBackend replaces self.path_or_stream with an in-memory PDF
 
-        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        self._pdoc = pdfium.PdfDocument(self.path_or_stream)  # hence read self.path_or_stream from here on, not the raw argument
         self.parser = pdf_parser()
 
         success = False
-        if isinstance(path_or_stream, BytesIO):
+        if isinstance(self.path_or_stream, BytesIO):
             success = self.parser.load_document_from_bytesio(
-                document_hash, path_or_stream
+                self.document_hash, self.path_or_stream
+            )
+        elif isinstance(self.path_or_stream, Path):
+            success = self.parser.load_document(
+                self.document_hash, str(self.path_or_stream)
             )
-        elif isinstance(path_or_stream, Path):
-            success = self.parser.load_document(document_hash, str(path_or_stream))
 
         if not success:
             raise RuntimeError(
-                f"docling-parse could not load document with hash {document_hash}."
+                f"docling-parse could not load document with hash {self.document_hash}."
             )
 
     def page_count(self) -> int:
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 216d156d..b536d2ff 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -15,14 +15,15 @@ from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
 
 _log = logging.getLogger(__name__)
 
 
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
         _log.debug("About to init HTML backend...")
-        super().__init__(path_or_stream, document_hash)
         self.soup = None
         # HTML file:
         self.path_or_stream = path_or_stream
@@ -44,7 +45,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     self.soup = BeautifulSoup(html_content, "html.parser")
         except Exception as e:
             raise RuntimeError(
-                f"Could not initialize HTML backend for file with hash {document_hash}."
+                f"Could not initialize HTML backend for file with hash {self.document_hash}."
             ) from e
 
     def is_valid(self) -> bool:
diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index b67c3ca3..2914e1e0 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -23,13 +23,14 @@ from docling.backend.abstract_backend import (
     PaginatedDocumentBackend,
 )
 from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
 
 _log = logging.getLogger(__name__)
 
 
 class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        super().__init__(path_or_stream, document_hash)
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
         self.namespaces = {
             "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
             "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
@@ -45,7 +46,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
             self.valid = True
         except Exception as e:
             raise RuntimeError(
-                f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
+                f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
             ) from e
 
         return
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index c3504b33..ed7c065c 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -17,20 +17,21 @@ from lxml import etree
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
 
 _log = logging.getLogger(__name__)
 
 
 class MsWordDocumentBackend(DeclarativeDocumentBackend):
 
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
         self.XML_KEY = (
             "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
         )
         self.xml_namespaces = {
             "w": "http://schemas.microsoft.com/office/word/2003/wordml"
         }
-        super().__init__(path_or_stream, document_hash)
         # self.initialise(path_or_stream)
         # Word file:
         self.path_or_stream = path_or_stream
diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py
index daf04321..5fe5be3d 100644
--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@@ -1,11 +1,14 @@
 from abc import ABC, abstractmethod
-from typing import Iterable, Optional, Set
+from io import BytesIO
+from pathlib import Path
+from typing import Iterable, Optional, Set, Union
 
 from docling_core.types.experimental import BoundingBox, Size
 from PIL import Image
 
 from docling.backend.abstract_backend import PaginatedDocumentBackend
 from docling.datamodel.base_models import Cell, InputFormat
+from docling.datamodel.document import InputDocument
 
 
 class PdfPageBackend(ABC):
@@ -42,6 +45,22 @@ class PdfPageBackend(ABC):
 
 
 class PdfDocumentBackend(PaginatedDocumentBackend):
+
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Keep PDF input as is, re-encode IMAGE input as an in-memory PDF; any other format raises RuntimeError."""
+        super().__init__(in_doc, path_or_stream)
+        if self.input_format is InputFormat.IMAGE:
+            buf = BytesIO()
+            # The context manager closes the source image (and its file handle) once it is saved.
+            with Image.open(self.path_or_stream) as img:
+                img.save(buf, "PDF")
+            buf.seek(0)  # rewind so the concrete backend reads the PDF from the start
+            self.path_or_stream = buf
+        elif self.input_format is not InputFormat.PDF:
+            raise RuntimeError(
+                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
+            )
+
     @abstractmethod
     def load_page(self, page_no: int) -> PdfPageBackend:
         pass
diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py
index e4c6e423..4fdbdbb9 100644
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@@ -232,13 +232,14 @@ class PyPdfiumPageBackend(PdfPageBackend):
 
 
 class PyPdfiumDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        super().__init__(path_or_stream, document_hash)
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)  # for IMAGE input, PdfDocumentBackend replaces self.path_or_stream with an in-memory PDF
+
         try:
-            self._pdoc = pdfium.PdfDocument(path_or_stream)
+            self._pdoc = pdfium.PdfDocument(self.path_or_stream)  # open whatever the base class left in self.path_or_stream, not the raw argument
         except PdfiumError as e:
             raise RuntimeError(
-                f"pypdfium could not load document with hash {document_hash}"
+                f"pypdfium could not load document with hash {self.document_hash}"
             ) from e
 
     def page_count(self) -> int:
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index 9240c6b5..3bcbc080 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -149,9 +149,7 @@ class InputDocument(BaseModel):
                 f"Please check your format configuration on DocumentConverter."
             )
 
-        self._backend = backend(
-            path_or_stream=path_or_stream, document_hash=self.document_hash
-        )
+        self._backend = backend(self, path_or_stream=path_or_stream)
 
 
 class DocumentFormat(str, Enum):
diff --git a/docling/document_converter.py b/docling/document_converter.py
index f354d58b..adecaca1 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -61,6 +61,11 @@ class PdfFormatOption(FormatOption):
     backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
 
 
+class ImageFormatOption(FormatOption):  # NOTE(review): presumably the option for InputFormat.IMAGE; its registration is not visible in this hunk
+    pipeline_cls: Type = StandardPdfModelPipeline  # default pipeline for this option is the PDF model pipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend  # a PdfDocumentBackend, which re-encodes IMAGE input as PDF
+
+
 _format_to_default_options = {
     InputFormat.DOCX: FormatOption(
         pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py
index f93db241..4e1b0841 100644
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@@ -21,57 +21,34 @@ input_paths = [
     Path("tests/data/word_sample.docx"),
     Path("tests/data/lorem_ipsum.docx"),
     Path("tests/data/powerpoint_sample.pptx"),
+    Path("tests/data/2305.03393v1-pg9-img.png"),
     Path("tests/data/2206.01062.pdf"),
-    # Path("tests/data/2305.03393v1-pg9-img.png"),
 ]
 
 ## for defaults use:
 # doc_converter = DocumentConverter()
 
 ## to customize use:
-doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
-    formats=[
-        InputFormat.PDF,
-        # InputFormat.IMAGE,
-        InputFormat.DOCX,
-        InputFormat.HTML,
-        InputFormat.PPTX,
-    ],  # whitelist formats, other files are ignored.
-    format_options={
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
-        ),  # PdfFormatOption(backend=PyPdfiumDocumentBackend),
-        InputFormat.DOCX: WordFormatOption(
-            pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
-        ),
-        # InputFormat.IMAGE: PdfFormatOption(),
-    },
+doc_converter = (
+    DocumentConverter(  # all of the below is optional, has internal defaults.
+        formats=[
+            InputFormat.PDF,
+            InputFormat.IMAGE,
+            InputFormat.DOCX,
+            InputFormat.HTML,
+            InputFormat.PPTX,
+        ],  # whitelist formats, non-matching files are ignored.
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
+            ),
+            InputFormat.DOCX: WordFormatOption(
+                pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
+            ),
+        },
+    )
 )
 
-doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
-    pdf=None,
-    docx=WordFormatOption(
-        pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
-    ),
-    formats=[
-        InputFormat.PDF,
-        # InputFormat.IMAGE,
-        InputFormat.DOCX,
-        InputFormat.HTML,
-        InputFormat.PPTX,
-    ],  # whitelist formats, other files are ignored.
-    format_options={
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
-        ),  # PdfFormatOption(backend=PyPdfiumDocumentBackend),
-        InputFormat.DOCX: WordFormatOption(
-            pipeline_cls=SimpleModelPipeline  # , backend=MsWordDocumentBackend
-        ),
-        # InputFormat.IMAGE: PdfFormatOption(),
-    },
-)
-
-
 conv_results = doc_converter.convert_all(input_paths)
 
 for res in conv_results:
diff --git a/tests/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py
index 2b8e2f4d..fac71e70 100644
--- a/tests/test_backend_docling_parse.py
+++ b/tests/test_backend_docling_parse.py
@@ -7,6 +7,8 @@ from docling.backend.docling_parse_backend import (
     DoclingParseDocumentBackend,
     DoclingParsePageBackend,
 )
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
 
 
 @pytest.fixture
@@ -14,10 +16,21 @@ def test_doc_path():
     return Path("./tests/data/2206.01062.pdf")
 
 
+def _get_backend(pdf_doc):
+    """Return the docling-parse backend obtained by wrapping `pdf_doc` in an InputDocument."""
+    # The InputDocument instantiates the requested backend class and keeps it in `_backend`.
+    input_document = InputDocument(
+        path_or_stream=pdf_doc,
+        format=InputFormat.PDF,
+        backend=DoclingParseDocumentBackend,
+    )
+    return input_document._backend
+
+
 def test_text_cell_counts():
     pdf_doc = Path("./tests/data/redp5695.pdf")
 
-    doc_backend = DoclingParseDocumentBackend(pdf_doc, "123456xyz")
+    doc_backend = _get_backend(pdf_doc)
 
     for page_index in range(0, doc_backend.page_count()):
         last_cell_count = None
@@ -36,7 +49,7 @@ def test_text_cell_counts():
 
 
 def test_get_text_from_rect(test_doc_path):
-    doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz")
+    doc_backend = _get_backend(test_doc_path)
     page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
 
     # Get the title text of the DocLayNet paper
@@ -49,7 +62,7 @@ def test_get_text_from_rect(test_doc_path):
 
 
 def test_crop_page_image(test_doc_path):
-    doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz")
+    doc_backend = _get_backend(test_doc_path)
     page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
@@ -60,5 +73,5 @@ def test_crop_page_image(test_doc_path):
 
 
 def test_num_pages(test_doc_path):
-    doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz")
-    doc_backend.page_count() == 9
+    doc_backend = _get_backend(test_doc_path)
+    assert doc_backend.page_count() == 9  # without `assert` the comparison was discarded and the test could never fail
diff --git a/tests/test_backend_pdfium.py b/tests/test_backend_pdfium.py
index c3050b34..9c0c3dd1 100644
--- a/tests/test_backend_pdfium.py
+++ b/tests/test_backend_pdfium.py
@@ -7,6 +7,8 @@ from docling.backend.pypdfium2_backend import (
     PyPdfiumDocumentBackend,
     PyPdfiumPageBackend,
 )
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
 
 
 @pytest.fixture
@@ -14,10 +16,21 @@ def test_doc_path():
     return Path("./tests/data/2206.01062.pdf")
 
 
+def _get_backend(pdf_doc):
+    """Return the pypdfium backend obtained by wrapping `pdf_doc` in an InputDocument."""
+    # The InputDocument instantiates the requested backend class and keeps it in `_backend`.
+    input_document = InputDocument(
+        path_or_stream=pdf_doc,
+        format=InputFormat.PDF,
+        backend=PyPdfiumDocumentBackend,
+    )
+    return input_document._backend
+
+
 def test_text_cell_counts():
     pdf_doc = Path("./tests/data/redp5695.pdf")
 
-    doc_backend = PyPdfiumDocumentBackend(pdf_doc, "123456xyz")
+    doc_backend = _get_backend(pdf_doc)
 
     for page_index in range(0, doc_backend.page_count()):
         last_cell_count = None
@@ -36,7 +49,7 @@ def test_text_cell_counts():
 
 
 def test_get_text_from_rect(test_doc_path):
-    doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz")
+    doc_backend = _get_backend(test_doc_path)
     page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
 
     # Get the title text of the DocLayNet paper
@@ -49,7 +62,7 @@ def test_get_text_from_rect(test_doc_path):
 
 
 def test_crop_page_image(test_doc_path):
-    doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz")
+    doc_backend = _get_backend(test_doc_path)
     page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
@@ -60,5 +73,5 @@ def test_crop_page_image(test_doc_path):
 
 
 def test_num_pages(test_doc_path):
-    doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz")
-    doc_backend.page_count() == 9
+    doc_backend = _get_backend(test_doc_path)
+    assert doc_backend.page_count() == 9  # without `assert` the comparison was discarded and the test could never fail