From f6892c78775048afac92788d353327c0d54c4a14 Mon Sep 17 00:00:00 2001
From: Yusik Kim
Date: Thu, 20 Mar 2025 15:18:33 +0100
Subject: [PATCH] fix: properly serialize per page

Signed-off-by: Yusik Kim
---
 docling/utils/doctags_utils.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/docling/utils/doctags_utils.py b/docling/utils/doctags_utils.py
index 06026374..1ff7159c 100644
--- a/docling/utils/doctags_utils.py
+++ b/docling/utils/doctags_utils.py
@@ -1,12 +1,13 @@
 from pathlib import Path
 from typing import cast
 
+from docling_core.experimental.serializer.base import SerializationResult
 from docling_core.experimental.serializer.doctags import (
     DocTagsDocSerializer,
     DocTagsParams,
 )
 from docling_core.types.doc import DoclingDocument, Size
-from docling_core.types.doc.document import DocTagsDocument
+from docling_core.types.doc.document import DocItem, DocTagsDocument
 from PIL import Image as PILImage
 
 
@@ -16,10 +17,22 @@ def remove_doctags_content(doctags: list[str], images: list[PILImage.Image]) -> 
     )
     doc = DoclingDocument(name="dummy")
     doc.load_from_doctags(doctags_doc)
+
     for idx, image in enumerate(images):
         size = Size(width=float(image.width), height=float(image.height))
         doc.add_page(page_no=idx + 1, size=size)
     dt_params = DocTagsParams(add_content=False)
     ser = DocTagsDocSerializer(params=dt_params, doc=doc)
-    items = [ser.serialize(item=item) for item, _ in doc.iterate_items()]
-    return ser.serialize_doc(pages=items).text
+    page_items: dict[int, list[SerializationResult]]
+    page_items = {}
+    for item, _ in doc.iterate_items():
+        if not isinstance(item, DocItem):
+            continue
+        page_no = cast(DocItem, item).prov[0].page_no
+        if page_no in page_items:
+            page_items[page_no].append(ser.serialize(item=item))
+        else:
+            page_items[page_no] = [ser.serialize(item=item)]
+    pages = [ser.serialize_page(parts=parts) for parts in page_items.values()]
+
+    return ser.serialize_doc(pages=pages).text