fix: change argument to single doctags string

Signed-off-by: Yusik Kim <kmyusk@gmail.com>
This commit is contained in:
Yusik Kim 2025-03-20 16:02:23 +01:00
parent f6892c7877
commit 38a23eb50b
2 changed files with 19 additions and 5 deletions

View File

@@ -8,12 +8,18 @@ from docling_core.experimental.serializer.doctags import (
) )
from docling_core.types.doc import DoclingDocument, Size from docling_core.types.doc import DoclingDocument, Size
from docling_core.types.doc.document import DocItem, DocTagsDocument from docling_core.types.doc.document import DocItem, DocTagsDocument
from docling_core.types.doc.tokens import DocumentToken
from PIL import Image as PILImage from PIL import Image as PILImage
def remove_doctags_content(doctags: list[str], images: list[PILImage.Image]) -> str: def remove_doctags_content(doctags: str, images: list[PILImage.Image]) -> str:
dt_list = (
doctags.removeprefix(f"<{DocumentToken.DOCUMENT.value}>")
.removesuffix(f"\n</{DocumentToken.DOCUMENT.value}>")
.split(f"\n<{DocumentToken.PAGE_BREAK.value}>\n")
)
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs( doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
cast(list[str | Path], doctags), cast(list[PILImage.Image | Path], images) cast(list[str | Path], dt_list), cast(list[PILImage.Image | Path], images)
) )
doc = DoclingDocument(name="dummy") doc = DoclingDocument(name="dummy")
doc.load_from_doctags(doctags_doc) doc.load_from_doctags(doctags_doc)
@@ -23,8 +29,7 @@ def remove_doctags_content(doctags: list[str], images: list[PILImage.Image]) ->
doc.add_page(page_no=idx + 1, size=size) doc.add_page(page_no=idx + 1, size=size)
dt_params = DocTagsParams(add_content=False) dt_params = DocTagsParams(add_content=False)
ser = DocTagsDocSerializer(params=dt_params, doc=doc) ser = DocTagsDocSerializer(params=dt_params, doc=doc)
page_items: dict[int, list[SerializationResult]] page_items: dict[int, list[SerializationResult]] = {}
page_items = {}
for item, _ in doc.iterate_items(): for item, _ in doc.iterate_items():
if not isinstance(item, DocItem): if not isinstance(item, DocItem):
continue continue

View File

@@ -1,3 +1,4 @@
from docling_core.types.doc.tokens import DocumentToken
from PIL import Image as PILImage from PIL import Image as PILImage
from docling.utils.doctags_utils import remove_doctags_content from docling.utils.doctags_utils import remove_doctags_content
@@ -7,7 +8,15 @@ def test_remove_doctags_content():
img = PILImage.open("./tests/data_scanned/ocr_test.png") img = PILImage.open("./tests/data_scanned/ocr_test.png")
with open("./tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt") as f: with open("./tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt") as f:
doctags = f.read() doctags = f.read()
actual = remove_doctags_content([doctags] * 2, [img] * 2) unwrapped = doctags.removeprefix(f"<{DocumentToken.DOCUMENT.value}>").removesuffix(
f"\n</{DocumentToken.DOCUMENT.value}>"
)
rewrapped = (
f"<{DocumentToken.DOCUMENT.value}>"
+ f"\n<{DocumentToken.PAGE_BREAK.value}>\n".join([unwrapped] * 2)
+ f"\n</{DocumentToken.DOCUMENT.value}>"
)
actual = remove_doctags_content(rewrapped, [img] * 2)
expected = """<doctag><text><loc_58><loc_44><loc_426><loc_91></text> expected = """<doctag><text><loc_58><loc_44><loc_426><loc_91></text>
<page_break> <page_break>
<text><loc_58><loc_44><loc_426><loc_91></text> <text><loc_58><loc_44><loc_426><loc_91></text>