diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index 27a368f9..c905203e 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -11,7 +11,7 @@ from PIL import Image, ImageDraw from pypdfium2 import PdfPage from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend -from docling.datamodel.base_models import Cell, Size +from docling.datamodel.base_models import Cell, Size, TextDirection if TYPE_CHECKING: from docling.datamodel.document import InputDocument @@ -97,6 +97,7 @@ class DoclingParseV2PageBackend(PdfPageBackend): y0 = cell_data[cells_header.index("y0")] x1 = cell_data[cells_header.index("x1")] y1 = cell_data[cells_header.index("y1")] + ltr = cell_data[cells_header.index("left_to_right")] if x1 < x0: x0, x1 = x1, x0 @@ -116,6 +117,11 @@ class DoclingParseV2PageBackend(PdfPageBackend): t=y1 * page_size.height / parser_height, coord_origin=CoordOrigin.BOTTOMLEFT, ).to_top_left_origin(page_size.height), + text_direction=( + TextDirection.LEFT_TO_RIGHT + if ltr + else TextDirection.RIGHT_TO_LEFT + ), ) ) cell_counter += 1 diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index d1e7ce3a..6bd5b61d 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -120,10 +120,16 @@ class ErrorItem(BaseModel): error_message: str +class TextDirection(str, Enum): + LEFT_TO_RIGHT = "left_to_right" + RIGHT_TO_LEFT = "right_to_left" + + class Cell(BaseModel): id: int text: str bbox: BoundingBox + text_direction: TextDirection = TextDirection.LEFT_TO_RIGHT class OcrCell(Cell): diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 6f7de07a..013b89f5 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -264,6 +264,7 @@ class GlmModel: glm_doc = self.model.apply_on_doc(ds_doc_dict) docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental + 1 == 1 # DEBUG code: def draw_clusters_and_cells(ds_document, page_no, show: bool = False): diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 4acf8c95..38b23215 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -11,6 +11,7 @@ from docling.datamodel.base_models import ( Page, PageElement, Table, + TextDirection, TextElement, ) from docling.datamodel.document import ConversionResult @@ -75,12 +76,23 @@ class PageAssembleModel(BasePageModel): for cluster in page.predictions.layout.clusters: # _log.info("Cluster label seen:", cluster.label) if cluster.label in LayoutModel.TEXT_ELEM_LABELS: + textlines = [] + + dominant_text_direction = TextDirection.LEFT_TO_RIGHT + + # Naive code: dominant text direction == direction of first cell. + for cell in cluster.cells: + dominant_text_direction = cell.text_direction + break + + for cell in cluster.cells: + text = cell.text.replace("\x02", "-").strip() + if text: + # if dominant_text_direction == TextDirection.RIGHT_TO_LEFT: + # textlines.insert(0, text) # Prepend RTL text + # else: + textlines.append(text) # Append LTR text - textlines = [ - cell.text.replace("\x02", "-").strip() - for cell in cluster.cells - if len(cell.text.strip()) > 0 - ] text = self.sanitize_text(textlines) text_el = TextElement( label=cluster.label, diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py index f6ad92bd..4e00e353 100644 --- a/docs/examples/batch_convert.py +++ b/docs/examples/batch_convert.py @@ -49,6 +49,8 @@ def export_documents( with (output_dir / f"{doc_filename}.md").open("w") as fp: fp.write(conv_res.document.export_to_markdown()) + conv_res.document.save_as_html(output_dir / f"{doc_filename}.html") + # Export Docling document format to text: with (output_dir / f"{doc_filename}.txt").open("w") as fp: fp.write(conv_res.document.export_to_markdown(strict_text=True)) @@ -103,10 +105,13 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_paths = [ - Path("./tests/data/2206.01062.pdf"), - Path("./tests/data/2203.01017v2.pdf"), - Path("./tests/data/2305.03393v1.pdf"), - Path("./tests/data/redp5110_sampled.pdf"), + Path("./tests/data/pdf/right_to_left_01.pdf"), + Path("./tests/data/pdf/right_to_left_02.pdf"), + Path("./tests/data/pdf/right_to_left_03.pdf"), + Path("./tests/data/pdf/2206.01062.pdf"), + Path("./tests/data/pdf/2203.01017v2.pdf"), + Path("./tests/data/pdf/2305.03393v1.pdf"), + Path("./tests/data/pdf/redp5110_sampled.pdf"), ] # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read()) diff --git a/tests/test_backend_docling_parse_v2.py b/tests/test_backend_docling_parse_v2.py index ee0e5c75..d69b1101 100644 --- a/tests/test_backend_docling_parse_v2.py +++ b/tests/test_backend_docling_parse_v2.py @@ -26,6 +26,17 @@ def _get_backend(pdf_doc): return doc_backend +def test_cell_ordering(): + pdf_doc = Path("tests/data/pdf/right_to_left_01.pdf") + doc_backend = _get_backend(pdf_doc) + + for page_index in range(0, doc_backend.page_count()): + page_backend: DoclingParseV2PageBackend = doc_backend.load_page(page_index) + cells = list(page_backend.get_text_cells()) + + 1 == 1 + + def test_text_cell_counts(): pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")