From 8979abd865b3a4155a5d28a818be8511b157d87f Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Thu, 6 Feb 2025 19:06:32 +0100
Subject: [PATCH] Revert unwanted RTL (right-to-left) text-direction additions

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling/backend/docling_parse_v2_backend.py |  8 +-------
 docling/datamodel/base_models.py            |  6 ------
 docling/models/page_assemble_model.py       | 22 +++++----------------
 docs/examples/batch_convert.py              | 13 ++++--------
 tests/test_backend_docling_parse_v2.py      |  9 ---------
 5 files changed, 10 insertions(+), 48 deletions(-)

diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py
index c905203e..27a368f9 100644
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@@ -11,7 +11,7 @@ from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
 
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell, Size, TextDirection
+from docling.datamodel.base_models import Cell, Size
 
 if TYPE_CHECKING:
     from docling.datamodel.document import InputDocument
@@ -97,7 +97,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
             y0 = cell_data[cells_header.index("y0")]
             x1 = cell_data[cells_header.index("x1")]
             y1 = cell_data[cells_header.index("y1")]
-            ltr = cell_data[cells_header.index("left_to_right")]
 
             if x1 < x0:
                 x0, x1 = x1, x0
@@ -117,11 +116,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
                         t=y1 * page_size.height / parser_height,
                         coord_origin=CoordOrigin.BOTTOMLEFT,
                     ).to_top_left_origin(page_size.height),
-                    text_direction=(
-                        TextDirection.LEFT_TO_RIGHT
-                        if ltr
-                        else TextDirection.RIGHT_TO_LEFT
-                    ),
                 )
             )
             cell_counter += 1
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index 6bd5b61d..d1e7ce3a 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -120,16 +120,10 @@ class ErrorItem(BaseModel):
     error_message: str
 
 
-class TextDirection(str, Enum):
-    LEFT_TO_RIGHT = "left_to_right"
-    RIGHT_TO_LEFT = "right_to_left"
-
-
 class Cell(BaseModel):
     id: int
     text: str
     bbox: BoundingBox
-    text_direction: TextDirection = TextDirection.LEFT_TO_RIGHT
 
 
 class OcrCell(Cell):
diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py
index 38b23215..4acf8c95 100644
--- a/docling/models/page_assemble_model.py
+++ b/docling/models/page_assemble_model.py
@@ -11,7 +11,6 @@ from docling.datamodel.base_models import (
     Page,
     PageElement,
     Table,
-    TextDirection,
     TextElement,
 )
 from docling.datamodel.document import ConversionResult
@@ -76,23 +75,12 @@ class PageAssembleModel(BasePageModel):
                     for cluster in page.predictions.layout.clusters:
                         # _log.info("Cluster label seen:", cluster.label)
                         if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
-                            textlines = []
-
-                            dominant_text_direction = TextDirection.LEFT_TO_RIGHT
-
-                            # Naive code: dominant text direction == direction of first cell.
-                            for cell in cluster.cells:
-                                dominant_text_direction = cell.text_direction
-                                break
-
-                            for cell in cluster.cells:
-                                text = cell.text.replace("\x02", "-").strip()
-                                if text:
-                                    # if dominant_text_direction == TextDirection.RIGHT_TO_LEFT:
-                                    #    textlines.insert(0, text)  # Prepend RTL text
-                                    # else:
-                                    textlines.append(text)  # Append LTR text
 
+                            textlines = [
+                                cell.text.replace("\x02", "-").strip()
+                                for cell in cluster.cells
+                                if len(cell.text.strip()) > 0
+                            ]
                             text = self.sanitize_text(textlines)
                             text_el = TextElement(
                                 label=cluster.label,
diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py
index 4e00e353..f6ad92bd 100644
--- a/docs/examples/batch_convert.py
+++ b/docs/examples/batch_convert.py
@@ -49,8 +49,6 @@ def export_documents(
                 with (output_dir / f"{doc_filename}.md").open("w") as fp:
                     fp.write(conv_res.document.export_to_markdown())
 
-                conv_res.document.save_as_html(output_dir / f"{doc_filename}.html")
-
                 # Export Docling document format to text:
                 with (output_dir / f"{doc_filename}.txt").open("w") as fp:
                     fp.write(conv_res.document.export_to_markdown(strict_text=True))
@@ -105,13 +103,10 @@ def main():
     logging.basicConfig(level=logging.INFO)
 
     input_doc_paths = [
-        Path("./tests/data/pdf/right_to_left_01.pdf"),
-        Path("./tests/data/pdf/right_to_left_02.pdf"),
-        Path("./tests/data/pdf/right_to_left_03.pdf"),
-        Path("./tests/data/pdf/2206.01062.pdf"),
-        Path("./tests/data/pdf/2203.01017v2.pdf"),
-        Path("./tests/data/pdf/2305.03393v1.pdf"),
-        Path("./tests/data/pdf/redp5110_sampled.pdf"),
+        Path("./tests/data/2206.01062.pdf"),
+        Path("./tests/data/2203.01017v2.pdf"),
+        Path("./tests/data/2305.03393v1.pdf"),
+        Path("./tests/data/redp5110_sampled.pdf"),
     ]
 
     # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
diff --git a/tests/test_backend_docling_parse_v2.py b/tests/test_backend_docling_parse_v2.py
index 024ff211..ee0e5c75 100644
--- a/tests/test_backend_docling_parse_v2.py
+++ b/tests/test_backend_docling_parse_v2.py
@@ -26,15 +26,6 @@ def _get_backend(pdf_doc):
     return doc_backend
 
 
-def test_cell_ordering():
-    pdf_doc = Path("tests/data/pdf/right_to_left_01.pdf")
-    doc_backend = _get_backend(pdf_doc)
-
-    for page_index in range(0, doc_backend.page_count()):
-        page_backend: DoclingParseV2PageBackend = doc_backend.load_page(page_index)
-        cells = list(page_backend.get_text_cells())
-
-
 def test_text_cell_counts():
     pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")