fix(pypdfium): resolve overlapping text when merging bounding boxes (#1549)

Get merged_text from the bounding box instead of merging it, to prevent overlaps.

Signed-off-by: Pedro Ribeiro <pedro_ribeiro_93@hotmail.com>
This commit is contained in:
Pedro Ribeiro
2025-05-19 14:26:00 +01:00
committed by GitHub
parent 12a0e64892
commit 98b5eeb844
52 changed files with 52225 additions and 4690 deletions

View File

@@ -75,3 +75,16 @@ def test_crop_page_image(test_doc_path):
def test_num_pages(test_doc_path):
    """Check that the backend reports nine pages for the test document.

    Args:
        test_doc_path: Document path handed to ``_get_backend``
            (presumably supplied by a pytest fixture -- confirm in the
            full file).
    """
    doc_backend = _get_backend(test_doc_path)
    # This comparison used to be a bare expression statement, so its result
    # was discarded and the test could never fail. Assert it explicitly.
    assert doc_backend.page_count() == 9
def test_merge_row():
    """Check the text of the first cell on a page of ``multi_page.pdf``.

    Loads the page with index 4 through the backend returned by
    ``_get_backend`` and compares the first entry of ``get_text_cells()``
    against the expected line of text.
    """
    expected_text = "The journey of the word processor—from clunky typewriters to AI-powered platforms—"

    backend = _get_backend(Path("./tests/data/pdf/multi_page.pdf"))
    page: PyPdfiumPageBackend = backend.load_page(4)
    first_cell = page.get_text_cells()[0]

    assert first_cell.text == expected_text