Add code to expose text direction of cell

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-02-05 12:48:12 +01:00
parent d7c9874a88
commit 7bdd6868ed
6 changed files with 51 additions and 10 deletions

View File

@ -11,7 +11,7 @@ from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell, Size
from docling.datamodel.base_models import Cell, Size, TextDirection
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
@ -97,6 +97,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
y0 = cell_data[cells_header.index("y0")]
x1 = cell_data[cells_header.index("x1")]
y1 = cell_data[cells_header.index("y1")]
ltr = cell_data[cells_header.index("left_to_right")]
if x1 < x0:
x0, x1 = x1, x0
@ -116,6 +117,11 @@ class DoclingParseV2PageBackend(PdfPageBackend):
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(page_size.height),
text_direction=(
TextDirection.LEFT_TO_RIGHT
if ltr
else TextDirection.RIGHT_TO_LEFT
),
)
)
cell_counter += 1

View File

@ -120,10 +120,16 @@ class ErrorItem(BaseModel):
error_message: str
class TextDirection(str, Enum):
    """Direction in which the text of a cell runs.

    The enum mixes in ``str``, so each member compares equal to its plain
    string value (for example ``TextDirection.LEFT_TO_RIGHT == "left_to_right"``).
    """

    LEFT_TO_RIGHT = "left_to_right"
    RIGHT_TO_LEFT = "right_to_left"
class Cell(BaseModel):
    """A single text cell: an identifier, its text and its bounding box.

    ``text_direction`` is optional on construction and defaults to
    left-to-right, so callers that do not pass it keep working unchanged.
    """

    id: int
    text: str
    bbox: BoundingBox
    # Direction the cell's text runs in; left-to-right unless the producer
    # of the cell says otherwise.
    text_direction: TextDirection = TextDirection.LEFT_TO_RIGHT
class OcrCell(Cell):

View File

@ -264,6 +264,7 @@ class GlmModel:
glm_doc = self.model.apply_on_doc(ds_doc_dict)
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
1 == 1
# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):

View File

@ -11,6 +11,7 @@ from docling.datamodel.base_models import (
Page,
PageElement,
Table,
TextDirection,
TextElement,
)
from docling.datamodel.document import ConversionResult
@ -75,12 +76,23 @@ class PageAssembleModel(BasePageModel):
for cluster in page.predictions.layout.clusters:
# _log.info("Cluster label seen:", cluster.label)
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
textlines = []
dominant_text_direction = TextDirection.LEFT_TO_RIGHT
# Naive code: dominant text direction == direction of first cell.
for cell in cluster.cells:
dominant_text_direction = cell.text_direction
break
for cell in cluster.cells:
text = cell.text.replace("\x02", "-").strip()
if text:
# if dominant_text_direction == TextDirection.RIGHT_TO_LEFT:
# textlines.insert(0, text) # Prepend RTL text
# else:
textlines.append(text) # Append LTR text
textlines = [
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
if len(cell.text.strip()) > 0
]
text = self.sanitize_text(textlines)
text_el = TextElement(
label=cluster.label,

View File

@ -49,6 +49,8 @@ def export_documents(
with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(conv_res.document.export_to_markdown())
conv_res.document.save_as_html(output_dir / f"{doc_filename}.html")
# Export Docling document format to text:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.document.export_to_markdown(strict_text=True))
@ -103,10 +105,13 @@ def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
Path("./tests/data/2203.01017v2.pdf"),
Path("./tests/data/2305.03393v1.pdf"),
Path("./tests/data/redp5110_sampled.pdf"),
Path("./tests/data/pdf/right_to_left_01.pdf"),
Path("./tests/data/pdf/right_to_left_02.pdf"),
Path("./tests/data/pdf/right_to_left_03.pdf"),
Path("./tests/data/pdf/2206.01062.pdf"),
Path("./tests/data/pdf/2203.01017v2.pdf"),
Path("./tests/data/pdf/2305.03393v1.pdf"),
Path("./tests/data/pdf/redp5110_sampled.pdf"),
]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())

View File

@ -26,6 +26,17 @@ def _get_backend(pdf_doc):
return doc_backend
def test_cell_ordering():
    """Smoke-test text-cell extraction on a right-to-left sample PDF.

    Every page of ``right_to_left_01.pdf`` is loaded and its text cells are
    fully consumed; the test passes as long as nothing raises.
    """
    pdf_doc = Path("tests/data/pdf/right_to_left_01.pdf")
    doc_backend = _get_backend(pdf_doc)

    for page_index in range(doc_backend.page_count()):
        page_backend: DoclingParseV2PageBackend = doc_backend.load_page(page_index)
        # list() drains the result so every cell is actually produced
        # (matters if get_text_cells() is lazy -- TODO confirm).
        list(page_backend.get_text_cells())
        # TODO: assert on the expected ordering of the extracted cells.
def test_text_cell_counts():
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")