Revert unwanted RTL additions

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-02-06 19:06:32 +01:00
parent b7f5cdb230
commit 8979abd865
5 changed files with 10 additions and 48 deletions

View File

@ -11,7 +11,7 @@ from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell, Size, TextDirection
from docling.datamodel.base_models import Cell, Size
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
@ -97,7 +97,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
y0 = cell_data[cells_header.index("y0")]
x1 = cell_data[cells_header.index("x1")]
y1 = cell_data[cells_header.index("y1")]
ltr = cell_data[cells_header.index("left_to_right")]
if x1 < x0:
x0, x1 = x1, x0
@ -117,11 +116,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(page_size.height),
text_direction=(
TextDirection.LEFT_TO_RIGHT
if ltr
else TextDirection.RIGHT_TO_LEFT
),
)
)
cell_counter += 1

View File

@ -120,16 +120,10 @@ class ErrorItem(BaseModel):
error_message: str
class TextDirection(str, Enum):
LEFT_TO_RIGHT = "left_to_right"
RIGHT_TO_LEFT = "right_to_left"
class Cell(BaseModel):
id: int
text: str
bbox: BoundingBox
text_direction: TextDirection = TextDirection.LEFT_TO_RIGHT
class OcrCell(Cell):

View File

@ -11,7 +11,6 @@ from docling.datamodel.base_models import (
Page,
PageElement,
Table,
TextDirection,
TextElement,
)
from docling.datamodel.document import ConversionResult
@ -76,23 +75,12 @@ class PageAssembleModel(BasePageModel):
for cluster in page.predictions.layout.clusters:
# _log.info("Cluster label seen:", cluster.label)
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
textlines = []
dominant_text_direction = TextDirection.LEFT_TO_RIGHT
# Naive code: dominant text direction == direction of first cell.
for cell in cluster.cells:
dominant_text_direction = cell.text_direction
break
for cell in cluster.cells:
text = cell.text.replace("\x02", "-").strip()
if text:
# if dominant_text_direction == TextDirection.RIGHT_TO_LEFT:
# textlines.insert(0, text) # Prepend RTL text
# else:
textlines.append(text) # Append LTR text
textlines = [
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
if len(cell.text.strip()) > 0
]
text = self.sanitize_text(textlines)
text_el = TextElement(
label=cluster.label,

View File

@ -49,8 +49,6 @@ def export_documents(
with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(conv_res.document.export_to_markdown())
conv_res.document.save_as_html(output_dir / f"{doc_filename}.html")
# Export Docling document format to text:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.document.export_to_markdown(strict_text=True))
@ -105,13 +103,10 @@ def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/pdf/right_to_left_01.pdf"),
Path("./tests/data/pdf/right_to_left_02.pdf"),
Path("./tests/data/pdf/right_to_left_03.pdf"),
Path("./tests/data/pdf/2206.01062.pdf"),
Path("./tests/data/pdf/2203.01017v2.pdf"),
Path("./tests/data/pdf/2305.03393v1.pdf"),
Path("./tests/data/pdf/redp5110_sampled.pdf"),
Path("./tests/data/2206.01062.pdf"),
Path("./tests/data/2203.01017v2.pdf"),
Path("./tests/data/2305.03393v1.pdf"),
Path("./tests/data/redp5110_sampled.pdf"),
]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())

View File

@ -26,15 +26,6 @@ def _get_backend(pdf_doc):
return doc_backend
def test_cell_ordering():
pdf_doc = Path("tests/data/pdf/right_to_left_01.pdf")
doc_backend = _get_backend(pdf_doc)
for page_index in range(0, doc_backend.page_count()):
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(page_index)
cells = list(page_backend.get_text_cells())
def test_text_cell_counts():
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")