mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-29 21:44:32 +00:00
fix: change argument to single doctags string
Signed-off-by: Yusik Kim <kmyusk@gmail.com>
This commit is contained in:
parent
f6892c7877
commit
38a23eb50b
@ -8,12 +8,18 @@ from docling_core.experimental.serializer.doctags import (
|
||||
)
|
||||
from docling_core.types.doc import DoclingDocument, Size
|
||||
from docling_core.types.doc.document import DocItem, DocTagsDocument
|
||||
from docling_core.types.doc.tokens import DocumentToken
|
||||
from PIL import Image as PILImage
|
||||
|
||||
|
||||
def remove_doctags_content(doctags: list[str], images: list[PILImage.Image]) -> str:
|
||||
def remove_doctags_content(doctags: str, images: list[PILImage.Image]) -> str:
|
||||
dt_list = (
|
||||
doctags.removeprefix(f"<{DocumentToken.DOCUMENT.value}>")
|
||||
.removesuffix(f"\n</{DocumentToken.DOCUMENT.value}>")
|
||||
.split(f"\n<{DocumentToken.PAGE_BREAK.value}>\n")
|
||||
)
|
||||
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
|
||||
cast(list[str | Path], doctags), cast(list[PILImage.Image | Path], images)
|
||||
cast(list[str | Path], dt_list), cast(list[PILImage.Image | Path], images)
|
||||
)
|
||||
doc = DoclingDocument(name="dummy")
|
||||
doc.load_from_doctags(doctags_doc)
|
||||
@ -23,8 +29,7 @@ def remove_doctags_content(doctags: list[str], images: list[PILImage.Image]) ->
|
||||
doc.add_page(page_no=idx + 1, size=size)
|
||||
dt_params = DocTagsParams(add_content=False)
|
||||
ser = DocTagsDocSerializer(params=dt_params, doc=doc)
|
||||
page_items: dict[int, list[SerializationResult]]
|
||||
page_items = {}
|
||||
page_items: dict[int, list[SerializationResult]] = {}
|
||||
for item, _ in doc.iterate_items():
|
||||
if not isinstance(item, DocItem):
|
||||
continue
|
||||
|
@ -1,3 +1,4 @@
|
||||
from docling_core.types.doc.tokens import DocumentToken
|
||||
from PIL import Image as PILImage
|
||||
|
||||
from docling.utils.doctags_utils import remove_doctags_content
|
||||
@ -7,7 +8,15 @@ def test_remove_doctags_content():
|
||||
img = PILImage.open("./tests/data_scanned/ocr_test.png")
|
||||
with open("./tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt") as f:
|
||||
doctags = f.read()
|
||||
actual = remove_doctags_content([doctags] * 2, [img] * 2)
|
||||
unwrapped = doctags.removeprefix(f"<{DocumentToken.DOCUMENT.value}>").removesuffix(
|
||||
f"\n</{DocumentToken.DOCUMENT.value}>"
|
||||
)
|
||||
rewrapped = (
|
||||
f"<{DocumentToken.DOCUMENT.value}>"
|
||||
+ f"\n<{DocumentToken.PAGE_BREAK.value}>\n".join([unwrapped] * 2)
|
||||
+ f"\n</{DocumentToken.DOCUMENT.value}>"
|
||||
)
|
||||
actual = remove_doctags_content(rewrapped, [img] * 2)
|
||||
expected = """<doctag><text><loc_58><loc_44><loc_426><loc_91></text>
|
||||
<page_break>
|
||||
<text><loc_58><loc_44><loc_426><loc_91></text>
|
||||
|
Loading…
Reference in New Issue
Block a user