mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
fix: properly serialize per page
Signed-off-by: Yusik Kim <kmyusk@gmail.com>
This commit is contained in:
parent
303b77f03d
commit
f6892c7877
@ -1,12 +1,13 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import cast
|
from typing import cast
|
||||||
|
|
||||||
|
from docling_core.experimental.serializer.base import SerializationResult
|
||||||
from docling_core.experimental.serializer.doctags import (
|
from docling_core.experimental.serializer.doctags import (
|
||||||
DocTagsDocSerializer,
|
DocTagsDocSerializer,
|
||||||
DocTagsParams,
|
DocTagsParams,
|
||||||
)
|
)
|
||||||
from docling_core.types.doc import DoclingDocument, Size
|
from docling_core.types.doc import DoclingDocument, Size
|
||||||
from docling_core.types.doc.document import DocTagsDocument
|
from docling_core.types.doc.document import DocItem, DocTagsDocument
|
||||||
from PIL import Image as PILImage
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
|
|
||||||
@ -16,10 +17,22 @@ def remove_doctags_content(doctags: list[str], images: list[PILImage.Image]) ->
|
|||||||
)
|
)
|
||||||
doc = DoclingDocument(name="dummy")
|
doc = DoclingDocument(name="dummy")
|
||||||
doc.load_from_doctags(doctags_doc)
|
doc.load_from_doctags(doctags_doc)
|
||||||
|
|
||||||
for idx, image in enumerate(images):
|
for idx, image in enumerate(images):
|
||||||
size = Size(width=float(image.width), height=float(image.height))
|
size = Size(width=float(image.width), height=float(image.height))
|
||||||
doc.add_page(page_no=idx + 1, size=size)
|
doc.add_page(page_no=idx + 1, size=size)
|
||||||
dt_params = DocTagsParams(add_content=False)
|
dt_params = DocTagsParams(add_content=False)
|
||||||
ser = DocTagsDocSerializer(params=dt_params, doc=doc)
|
ser = DocTagsDocSerializer(params=dt_params, doc=doc)
|
||||||
items = [ser.serialize(item=item) for item, _ in doc.iterate_items()]
|
page_items: dict[int, list[SerializationResult]]
|
||||||
return ser.serialize_doc(pages=items).text
|
page_items = {}
|
||||||
|
for item, _ in doc.iterate_items():
|
||||||
|
if not isinstance(item, DocItem):
|
||||||
|
continue
|
||||||
|
page_no = cast(DocItem, item).prov[0].page_no
|
||||||
|
if page_no in page_items:
|
||||||
|
page_items[page_no].append(ser.serialize(item=item))
|
||||||
|
else:
|
||||||
|
page_items[page_no] = [ser.serialize(item=item)]
|
||||||
|
pages = [ser.serialize_page(parts=parts) for parts in page_items.values()]
|
||||||
|
|
||||||
|
return ser.serialize_doc(pages=pages).text
|
||||||
|
Loading…
Reference in New Issue
Block a user