fix: properly serialize per page

Signed-off-by: Yusik Kim <kmyusk@gmail.com>
This commit is contained in:
Yusik Kim 2025-03-20 15:18:33 +01:00
parent 303b77f03d
commit f6892c7877

View File

@ -1,12 +1,13 @@
from pathlib import Path from pathlib import Path
from typing import cast from typing import cast
from docling_core.experimental.serializer.base import SerializationResult
from docling_core.experimental.serializer.doctags import ( from docling_core.experimental.serializer.doctags import (
DocTagsDocSerializer, DocTagsDocSerializer,
DocTagsParams, DocTagsParams,
) )
from docling_core.types.doc import DoclingDocument, Size from docling_core.types.doc import DoclingDocument, Size
from docling_core.types.doc.document import DocTagsDocument from docling_core.types.doc.document import DocItem, DocTagsDocument
from PIL import Image as PILImage from PIL import Image as PILImage
@ -16,10 +17,22 @@ def remove_doctags_content(doctags: list[str], images: list[PILImage.Image]) ->
) )
doc = DoclingDocument(name="dummy") doc = DoclingDocument(name="dummy")
doc.load_from_doctags(doctags_doc) doc.load_from_doctags(doctags_doc)
for idx, image in enumerate(images): for idx, image in enumerate(images):
size = Size(width=float(image.width), height=float(image.height)) size = Size(width=float(image.width), height=float(image.height))
doc.add_page(page_no=idx + 1, size=size) doc.add_page(page_no=idx + 1, size=size)
dt_params = DocTagsParams(add_content=False) dt_params = DocTagsParams(add_content=False)
ser = DocTagsDocSerializer(params=dt_params, doc=doc) ser = DocTagsDocSerializer(params=dt_params, doc=doc)
items = [ser.serialize(item=item) for item, _ in doc.iterate_items()] page_items: dict[int, list[SerializationResult]]
return ser.serialize_doc(pages=items).text page_items = {}
for item, _ in doc.iterate_items():
if not isinstance(item, DocItem):
continue
page_no = cast(DocItem, item).prov[0].page_no
if page_no in page_items:
page_items[page_no].append(ser.serialize(item=item))
else:
page_items[page_no] = [ser.serialize(item=item)]
pages = [ser.serialize_page(parts=parts) for parts in page_items.values()]
return ser.serialize_doc(pages=pages).text