From 8979abd865b3a4155a5d28a818be8511b157d87f Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Thu, 6 Feb 2025 19:06:32 +0100 Subject: [PATCH] Revert unwanted RTL additions Signed-off-by: Christoph Auer --- docling/backend/docling_parse_v2_backend.py | 8 +------- docling/datamodel/base_models.py | 6 ------ docling/models/page_assemble_model.py | 22 +++++---------------- docs/examples/batch_convert.py | 13 ++++-------- tests/test_backend_docling_parse_v2.py | 9 --------- 5 files changed, 10 insertions(+), 48 deletions(-) diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index c905203e..27a368f9 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -11,7 +11,7 @@ from PIL import Image, ImageDraw from pypdfium2 import PdfPage from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend -from docling.datamodel.base_models import Cell, Size, TextDirection +from docling.datamodel.base_models import Cell, Size if TYPE_CHECKING: from docling.datamodel.document import InputDocument @@ -97,7 +97,6 @@ class DoclingParseV2PageBackend(PdfPageBackend): y0 = cell_data[cells_header.index("y0")] x1 = cell_data[cells_header.index("x1")] y1 = cell_data[cells_header.index("y1")] - ltr = cell_data[cells_header.index("left_to_right")] if x1 < x0: x0, x1 = x1, x0 @@ -117,11 +116,6 @@ class DoclingParseV2PageBackend(PdfPageBackend): t=y1 * page_size.height / parser_height, coord_origin=CoordOrigin.BOTTOMLEFT, ).to_top_left_origin(page_size.height), - text_direction=( - TextDirection.LEFT_TO_RIGHT - if ltr - else TextDirection.RIGHT_TO_LEFT - ), ) ) cell_counter += 1 diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 6bd5b61d..d1e7ce3a 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -120,16 +120,10 @@ class ErrorItem(BaseModel): error_message: str -class TextDirection(str, Enum): - LEFT_TO_RIGHT = "left_to_right" - RIGHT_TO_LEFT = "right_to_left" - - class Cell(BaseModel): id: int text: str bbox: BoundingBox - text_direction: TextDirection = TextDirection.LEFT_TO_RIGHT class OcrCell(Cell): diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 38b23215..4acf8c95 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -11,7 +11,6 @@ from docling.datamodel.base_models import ( Page, PageElement, Table, - TextDirection, TextElement, ) from docling.datamodel.document import ConversionResult @@ -76,23 +75,12 @@ class PageAssembleModel(BasePageModel): for cluster in page.predictions.layout.clusters: # _log.info("Cluster label seen:", cluster.label) if cluster.label in LayoutModel.TEXT_ELEM_LABELS: - textlines = [] - - dominant_text_direction = TextDirection.LEFT_TO_RIGHT - - # Naive code: dominant text direction == direction of first cell. - for cell in cluster.cells: - dominant_text_direction = cell.text_direction - break - - for cell in cluster.cells: - text = cell.text.replace("\x02", "-").strip() - if text: - # if dominant_text_direction == TextDirection.RIGHT_TO_LEFT: - # textlines.insert(0, text) # Prepend RTL text - # else: - textlines.append(text) # Append LTR text + textlines = [ + cell.text.replace("\x02", "-").strip() + for cell in cluster.cells + if len(cell.text.strip()) > 0 + ] text = self.sanitize_text(textlines) text_el = TextElement( label=cluster.label, diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py index 4e00e353..f6ad92bd 100644 --- a/docs/examples/batch_convert.py +++ b/docs/examples/batch_convert.py @@ -49,8 +49,6 @@ def export_documents( with (output_dir / f"{doc_filename}.md").open("w") as fp: fp.write(conv_res.document.export_to_markdown()) - conv_res.document.save_as_html(output_dir / f"{doc_filename}.html") - # Export Docling document format to text: with (output_dir / f"{doc_filename}.txt").open("w") as fp: fp.write(conv_res.document.export_to_markdown(strict_text=True)) @@ -105,13 +103,10 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_paths = [ - Path("./tests/data/pdf/right_to_left_01.pdf"), - Path("./tests/data/pdf/right_to_left_02.pdf"), - Path("./tests/data/pdf/right_to_left_03.pdf"), - Path("./tests/data/pdf/2206.01062.pdf"), - Path("./tests/data/pdf/2203.01017v2.pdf"), - Path("./tests/data/pdf/2305.03393v1.pdf"), - Path("./tests/data/pdf/redp5110_sampled.pdf"), + Path("./tests/data/2206.01062.pdf"), + Path("./tests/data/2203.01017v2.pdf"), + Path("./tests/data/2305.03393v1.pdf"), + Path("./tests/data/redp5110_sampled.pdf"), ] # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read()) diff --git a/tests/test_backend_docling_parse_v2.py b/tests/test_backend_docling_parse_v2.py index 024ff211..ee0e5c75 100644 --- a/tests/test_backend_docling_parse_v2.py +++ b/tests/test_backend_docling_parse_v2.py @@ -26,15 +26,6 @@ def _get_backend(pdf_doc): return doc_backend -def test_cell_ordering(): - pdf_doc = Path("tests/data/pdf/right_to_left_01.pdf") - doc_backend = _get_backend(pdf_doc) - - for page_index in range(0, doc_backend.page_count()): - page_backend: DoclingParseV2PageBackend = doc_backend.load_page(page_index) - cells = list(page_backend.get_text_cells()) - - def test_text_cell_counts(): pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")