mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
fix: change argument to single doctags string
Signed-off-by: Yusik Kim <kmyusk@gmail.com>
This commit is contained in:
parent
f6892c7877
commit
38a23eb50b
@ -8,12 +8,18 @@ from docling_core.experimental.serializer.doctags import (
|
|||||||
)
|
)
|
||||||
from docling_core.types.doc import DoclingDocument, Size
|
from docling_core.types.doc import DoclingDocument, Size
|
||||||
from docling_core.types.doc.document import DocItem, DocTagsDocument
|
from docling_core.types.doc.document import DocItem, DocTagsDocument
|
||||||
|
from docling_core.types.doc.tokens import DocumentToken
|
||||||
from PIL import Image as PILImage
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
|
|
||||||
def remove_doctags_content(doctags: list[str], images: list[PILImage.Image]) -> str:
|
def remove_doctags_content(doctags: str, images: list[PILImage.Image]) -> str:
|
||||||
|
dt_list = (
|
||||||
|
doctags.removeprefix(f"<{DocumentToken.DOCUMENT.value}>")
|
||||||
|
.removesuffix(f"\n</{DocumentToken.DOCUMENT.value}>")
|
||||||
|
.split(f"\n<{DocumentToken.PAGE_BREAK.value}>\n")
|
||||||
|
)
|
||||||
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
|
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
|
||||||
cast(list[str | Path], doctags), cast(list[PILImage.Image | Path], images)
|
cast(list[str | Path], dt_list), cast(list[PILImage.Image | Path], images)
|
||||||
)
|
)
|
||||||
doc = DoclingDocument(name="dummy")
|
doc = DoclingDocument(name="dummy")
|
||||||
doc.load_from_doctags(doctags_doc)
|
doc.load_from_doctags(doctags_doc)
|
||||||
@ -23,8 +29,7 @@ def remove_doctags_content(doctags: list[str], images: list[PILImage.Image]) ->
|
|||||||
doc.add_page(page_no=idx + 1, size=size)
|
doc.add_page(page_no=idx + 1, size=size)
|
||||||
dt_params = DocTagsParams(add_content=False)
|
dt_params = DocTagsParams(add_content=False)
|
||||||
ser = DocTagsDocSerializer(params=dt_params, doc=doc)
|
ser = DocTagsDocSerializer(params=dt_params, doc=doc)
|
||||||
page_items: dict[int, list[SerializationResult]]
|
page_items: dict[int, list[SerializationResult]] = {}
|
||||||
page_items = {}
|
|
||||||
for item, _ in doc.iterate_items():
|
for item, _ in doc.iterate_items():
|
||||||
if not isinstance(item, DocItem):
|
if not isinstance(item, DocItem):
|
||||||
continue
|
continue
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
from docling_core.types.doc.tokens import DocumentToken
|
||||||
from PIL import Image as PILImage
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
from docling.utils.doctags_utils import remove_doctags_content
|
from docling.utils.doctags_utils import remove_doctags_content
|
||||||
@ -7,7 +8,15 @@ def test_remove_doctags_content():
|
|||||||
img = PILImage.open("./tests/data_scanned/ocr_test.png")
|
img = PILImage.open("./tests/data_scanned/ocr_test.png")
|
||||||
with open("./tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt") as f:
|
with open("./tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt") as f:
|
||||||
doctags = f.read()
|
doctags = f.read()
|
||||||
actual = remove_doctags_content([doctags] * 2, [img] * 2)
|
unwrapped = doctags.removeprefix(f"<{DocumentToken.DOCUMENT.value}>").removesuffix(
|
||||||
|
f"\n</{DocumentToken.DOCUMENT.value}>"
|
||||||
|
)
|
||||||
|
rewrapped = (
|
||||||
|
f"<{DocumentToken.DOCUMENT.value}>"
|
||||||
|
+ f"\n<{DocumentToken.PAGE_BREAK.value}>\n".join([unwrapped] * 2)
|
||||||
|
+ f"\n</{DocumentToken.DOCUMENT.value}>"
|
||||||
|
)
|
||||||
|
actual = remove_doctags_content(rewrapped, [img] * 2)
|
||||||
expected = """<doctag><text><loc_58><loc_44><loc_426><loc_91></text>
|
expected = """<doctag><text><loc_58><loc_44><loc_426><loc_91></text>
|
||||||
<page_break>
|
<page_break>
|
||||||
<text><loc_58><loc_44><loc_426><loc_91></text>
|
<text><loc_58><loc_44><loc_426><loc_91></text>
|
||||||
|
Loading…
Reference in New Issue
Block a user