diff --git a/docling/utils/doctags_utils.py b/docling/utils/doctags_utils.py
index e3836c4a..01a126b4 100644
--- a/docling/utils/doctags_utils.py
+++ b/docling/utils/doctags_utils.py
@@ -1,50 +1,39 @@
-import base64
-import io
+from pathlib import Path
+from typing import cast
 
-from PIL import Image as PILImage
 from docling_core.experimental.serializer.doctags import (
     DocTagsDocSerializer,
     DocTagsParams,
 )
 from docling_core.types.doc import DoclingDocument, Size
-from docling_core.types.doc.document import DocTagsDocument, ImageRef, PageItem
-from pydantic import AnyUrl
+from docling_core.types.doc.document import DocTagsDocument
+from PIL import Image as PILImage
 
 
-def remove_doctags_content(doctags: str, image: PILImage.Image) -> str:
-    def from_pil_to_base64(img: PILImage.Image) -> str:
-        # Convert the image to a base64 str
-        buffered = io.BytesIO()
-        img.save(buffered, format="PNG") # Specify the format (e.g., JPEG, PNG, etc.)
-        image_bytes = buffered.getvalue()
-
-        # Encode the bytes to a Base64 string
-        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
-        return image_base64
-
-    def from_pil_to_base64uri(img: PILImage.Image) -> AnyUrl:
-        image_base64 = from_pil_to_base64(img)
-        uri = AnyUrl(f"data:image/png;base64,{image_base64}")
-
-        return uri
-
-    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
+def remove_doctags_content(doctags: list[str], images: list[PILImage.Image]) -> str:
+    """Re-serialize DocTags pages with content generation disabled.
+
+    Each DocTags string is paired with its page image, loaded into a
+    temporary DoclingDocument, and serialized back to DocTags with
+    ``add_content=False``.
+
+    Args:
+        doctags: DocTags strings, paired by position with ``images``.
+        images: Page images; each one's width and height become the size
+            of the page added at the matching 1-based page number.
+
+    Returns:
+        The text of the serialized DocTags document.
+    """
+    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
+        cast(list[str | Path], doctags), cast(list[PILImage.Image | Path], images)
+    )
     doc = DoclingDocument(name="dummy")
     doc.load_from_doctags(doctags_doc)
-    image_ref = ImageRef(
-        mimetype="image/png",
-        dpi=72,
-        size=Size(width=float(image.width), height=float(image.height)),
-        uri=from_pil_to_base64uri(image),
-    )
-    page_item = PageItem(
-        page_no=1,
-        size=Size(width=float(image.width), height=float(image.height)),
-        image=image_ref,
-    )
-
-    doc.pages[1] = page_item
+    for page_no, image in enumerate(images, start=1):
+        size = Size(width=float(image.width), height=float(image.height))
+        doc.add_page(page_no=page_no, size=size)
     dt_params = DocTagsParams(add_content=False)
     ser = DocTagsDocSerializer(params=dt_params, doc=doc)
-    pages = [ser.serialize(item=item) for item, _ in doc.iterate_items()]
-    return ser.serialize_doc(pages=pages).text
+    items = [ser.serialize(item=item) for item, _ in doc.iterate_items()]
+    return ser.serialize_doc(pages=items).text
diff --git a/tests/test_doctags_utils.py b/tests/test_doctags_utils.py
index 19a39d25..68f9a652 100644
--- a/tests/test_doctags_utils.py
+++ b/tests/test_doctags_utils.py
@@ -2,10 +2,14 @@ from PIL import Image as PILImage
from docling.utils.doctags_utils import remove_doctags_content
+
def test_remove_doctags_content():
img = PILImage.open("./tests/data_scanned/ocr_test.png")
with open("./tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt") as f:
doctags = f.read()
- actual = remove_doctags_content(doctags, img)
- expected = "\n"
+ actual = remove_doctags_content([doctags] * 2, [img] * 2)  # same page passed twice: two doctags/image pairs
+ expected = """
+
+
+"""
assert actual == expected