fix: Test cases for RTL programmatic PDFs and fixes for the formula model (#903)

fix: Support for RTL programmatic documents fix(parser): detect and handle rotated pages fix(parser): fix bug causing duplicated text fix(formula): improve stopping criteria chore: update lock file fix: temporarily constrain beautifulsoup * switch to code formula model v1.0.1 and new test pdf Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * switch to code formula model v1.0.1 and new test pdf Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * cleaned up the data folder in the tests Signed-off-by: Peter Staar <taa@zurich.ibm.com> * switch to code formula model v1.0.1 and new test pdf Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * added three test-files for right-to-left Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fix black Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * added new gt for test_e2e_conversion Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * added new gt for test_e2e_conversion Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * Add code to expose text direction of cell Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * new test file Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * update lock Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix mypy reports Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix example filepaths Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add test data results Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * pin wheel of latest docling-parse release Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use latest docling-core Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove debugging code Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix path to files in example Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Revert unwanted RTL additions Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix test data paths in examples Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: 
Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Co-authored-by: Peter Staar <taa@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-10 13:48:13 +00:00 · 2025-02-07 08:43:31 +01:00
parent ed74fe2ec0
commit 9114ada7bc
91 changed files with 620 additions and 313 deletions
--- a/docling/models/code_formula_model.py
+++ b/docling/models/code_formula_model.py
@@ -2,6 +2,7 @@ import re
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union

+import numpy as np
 from docling_core.types.doc import (
    CodeItem,
    DocItemLabel,
@@ -103,7 +104,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
                artifacts_path = artifacts_path / self._model_repo_folder

            self.code_formula_model = CodeFormulaPredictor(
-                artifacts_path=artifacts_path,
+                artifacts_path=str(artifacts_path),
                device=device,
                num_threads=accelerator_options.num_threads,
            )
@@ -123,7 +124,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
            repo_id="ds4sd/CodeFormula",
            force_download=force,
            local_dir=local_dir,
-            revision="v1.0.0",
+            revision="v1.0.1",
        )

        return Path(download_path)
@@ -231,7 +232,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
            return

        labels: List[str] = []
-        images: List[Image.Image] = []
+        images: List[Union[Image.Image, np.ndarray]] = []
        elements: List[TextItem] = []
        for el in element_batch:
            assert isinstance(el.item, TextItem)
--- a/docling/models/document_picture_classifier.py
+++ b/docling/models/document_picture_classifier.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union

+import numpy as np
 from docling_core.types.doc import (
    DoclingDocument,
    NodeItem,
@@ -94,7 +95,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
                artifacts_path = artifacts_path / self._model_repo_folder

            self.document_picture_classifier = DocumentFigureClassifierPredictor(
-                artifacts_path=artifacts_path,
+                artifacts_path=str(artifacts_path),
                device=device,
                num_threads=accelerator_options.num_threads,
            )
@@ -161,7 +162,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
                yield element
            return

-        images: List[Image.Image] = []
+        images: List[Union[Image.Image, np.ndarray]] = []
        elements: List[PictureItem] = []
        for el in element_batch:
            assert isinstance(el, PictureItem)
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -150,10 +150,12 @@ class LayoutModel(BasePageModel):
            else:
                with TimeRecorder(conv_res, "layout"):
                    assert page.size is not None
+                    page_image = page.get_image(scale=1.0)
+                    assert page_image is not None

                    clusters = []
                    for ix, pred_item in enumerate(
-                        self.layout_predictor.predict(page.get_image(scale=1.0))
+                        self.layout_predictor.predict(page_image)
                    ):
                        label = DocItemLabel(
                            pred_item["label"]