From 38a23eb50bd9fd3b46a88f5eb09491572b3938e9 Mon Sep 17 00:00:00 2001 From: Yusik Kim Date: Thu, 20 Mar 2025 16:02:23 +0100 Subject: [PATCH] fix: change argument to single doctags string Signed-off-by: Yusik Kim --- docling/utils/doctags_utils.py | 13 +++++++++---- tests/test_doctags_utils.py | 11 ++++++++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/docling/utils/doctags_utils.py b/docling/utils/doctags_utils.py index 1ff7159c..e3e61866 100644 --- a/docling/utils/doctags_utils.py +++ b/docling/utils/doctags_utils.py @@ -8,12 +8,18 @@ from docling_core.experimental.serializer.doctags import ( ) from docling_core.types.doc import DoclingDocument, Size from docling_core.types.doc.document import DocItem, DocTagsDocument +from docling_core.types.doc.tokens import DocumentToken from PIL import Image as PILImage -def remove_doctags_content(doctags: list[str], images: list[PILImage.Image]) -> str: +def remove_doctags_content(doctags: str, images: list[PILImage.Image]) -> str: + dt_list = ( + doctags.removeprefix(f"<{DocumentToken.DOCUMENT.value}>") + .removesuffix(f"\n") + .split(f"\n<{DocumentToken.PAGE_BREAK.value}>\n") + ) doctags_doc = DocTagsDocument.from_doctags_and_image_pairs( - cast(list[str | Path], doctags), cast(list[PILImage.Image | Path], images) + cast(list[str | Path], dt_list), cast(list[PILImage.Image | Path], images) ) doc = DoclingDocument(name="dummy") doc.load_from_doctags(doctags_doc) @@ -23,8 +29,7 @@ def remove_doctags_content(doctags: list[str], images: list[PILImage.Image]) -> doc.add_page(page_no=idx + 1, size=size) dt_params = DocTagsParams(add_content=False) ser = DocTagsDocSerializer(params=dt_params, doc=doc) - page_items: dict[int, list[SerializationResult]] - page_items = {} + page_items: dict[int, list[SerializationResult]] = {} for item, _ in doc.iterate_items(): if not isinstance(item, DocItem): continue diff --git a/tests/test_doctags_utils.py b/tests/test_doctags_utils.py index 68f9a652..ca1c9ab0 100644 --- a/tests/test_doctags_utils.py +++ b/tests/test_doctags_utils.py @@ -1,3 +1,4 @@ +from docling_core.types.doc.tokens import DocumentToken from PIL import Image as PILImage from docling.utils.doctags_utils import remove_doctags_content @@ -7,7 +8,15 @@ def test_remove_doctags_content(): img = PILImage.open("./tests/data_scanned/ocr_test.png") with open("./tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt") as f: doctags = f.read() - actual = remove_doctags_content([doctags] * 2, [img] * 2) + unwrapped = doctags.removeprefix(f"<{DocumentToken.DOCUMENT.value}>").removesuffix( + f"\n" + ) + rewrapped = ( + f"<{DocumentToken.DOCUMENT.value}>" + + f"\n<{DocumentToken.PAGE_BREAK.value}>\n".join([unwrapped] * 2) + + f"\n" + ) + actual = remove_doctags_content(rewrapped, [img] * 2) expected = """