From 411a33fc69b4c76f5285cb9bf2f1085ad1492a2f Mon Sep 17 00:00:00 2001 From: Yusik Kim Date: Thu, 20 Mar 2025 13:23:48 +0100 Subject: [PATCH] fix: make it work for multi-page Signed-off-by: Yusik Kim --- docling/utils/doctags_utils.py | 53 ++++++++++------------------------ tests/test_doctags_utils.py | 8 +++-- 2 files changed, 21 insertions(+), 40 deletions(-) diff --git a/docling/utils/doctags_utils.py b/docling/utils/doctags_utils.py index e3836c4a..01a126b4 100644 --- a/docling/utils/doctags_utils.py +++ b/docling/utils/doctags_utils.py @@ -1,50 +1,27 @@ -import base64 -import io +from pathlib import Path +from typing import cast -from PIL import Image as PILImage from docling_core.experimental.serializer.doctags import ( DocTagsDocSerializer, DocTagsParams, ) from docling_core.types.doc import DoclingDocument, Size -from docling_core.types.doc.document import DocTagsDocument, ImageRef, PageItem -from pydantic import AnyUrl +from docling_core.types.doc.document import DocTagsDocument +from PIL import Image as PILImage -def remove_doctags_content(doctags: str, image: PILImage.Image) -> str: - def from_pil_to_base64(img: PILImage.Image) -> str: - # Convert the image to a base64 str - buffered = io.BytesIO() - img.save(buffered, format="PNG") # Specify the format (e.g., JPEG, PNG, etc.) - image_bytes = buffered.getvalue() - - # Encode the bytes to a Base64 string - image_base64 = base64.b64encode(image_bytes).decode("utf-8") - return image_base64 - - def from_pil_to_base64uri(img: PILImage.Image) -> AnyUrl: - image_base64 = from_pil_to_base64(img) - uri = AnyUrl(f"data:image/png;base64,{image_base64}") - - return uri - - doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image]) +def remove_doctags_content(doctags: list[str], images: list[PILImage.Image]) -> str: + doctags_doc = DocTagsDocument.from_doctags_and_image_pairs( + cast(list[str | Path], doctags), cast(list[PILImage.Image | Path], images) + ) doc = DoclingDocument(name="dummy") doc.load_from_doctags(doctags_doc) - image_ref = ImageRef( - mimetype="image/png", - dpi=72, - size=Size(width=float(image.width), height=float(image.height)), - uri=from_pil_to_base64uri(image), - ) - page_item = PageItem( - page_no=1, - size=Size(width=float(image.width), height=float(image.height)), - image=image_ref, - ) - - doc.pages[1] = page_item + for idx, image in enumerate(images): + size = Size(width=float(image.width), height=float(image.height)) + doc.add_page(page_no=idx + 1, size=size) dt_params = DocTagsParams(add_content=False) ser = DocTagsDocSerializer(params=dt_params, doc=doc) - pages = [ser.serialize(item=item) for item, _ in doc.iterate_items()] - return ser.serialize_doc(pages=pages).text + items = [ser.serialize(item=item) for item, _ in doc.iterate_items()] + dt_params = DocTagsParams(add_content=False) + ser = DocTagsDocSerializer(params=dt_params, doc=doc) + return ser.serialize_doc(pages=items).text diff --git a/tests/test_doctags_utils.py b/tests/test_doctags_utils.py index 19a39d25..68f9a652 100644 --- a/tests/test_doctags_utils.py +++ b/tests/test_doctags_utils.py @@ -2,10 +2,14 @@ from PIL import Image as PILImage from docling.utils.doctags_utils import remove_doctags_content + def test_remove_doctags_content(): img = PILImage.open("./tests/data_scanned/ocr_test.png") with open("./tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt") as f: doctags = f.read() - actual = remove_doctags_content(doctags, img) - expected = "\n" + actual = remove_doctags_content([doctags] * 2, [img] * 2) + expected = """ + + +""" assert actual == expected