don't use raw doctags serializer

Signed-off-by: Yusik Kim <kmyusk@gmail.com>
This commit is contained in:
Yusik Kim 2025-03-25 10:23:53 +01:00
parent 3a09ca50bb
commit b7fc13f3c4
3 changed files with 69 additions and 690 deletions

View File

@@ -1,13 +1,8 @@
from pathlib import Path from pathlib import Path
from typing import cast from typing import cast
from docling_core.experimental.serializer.base import SerializationResult
from docling_core.experimental.serializer.doctags import (
DocTagsDocSerializer,
DocTagsParams,
)
from docling_core.types.doc import DoclingDocument, Size from docling_core.types.doc import DoclingDocument, Size
from docling_core.types.doc.document import DocItem, DocTagsDocument from docling_core.types.doc.document import DocTagsDocument
from docling_core.types.doc.tokens import DocumentToken from docling_core.types.doc.tokens import DocumentToken
from PIL import Image as PILImage from PIL import Image as PILImage
@@ -27,18 +22,5 @@ def remove_doctags_content(doctags: str, images: list[PILImage.Image]) -> str:
for idx, image in enumerate(images): for idx, image in enumerate(images):
size = Size(width=float(image.width), height=float(image.height)) size = Size(width=float(image.width), height=float(image.height))
doc.add_page(page_no=idx + 1, size=size) doc.add_page(page_no=idx + 1, size=size)
dt_params = DocTagsParams(add_content=False)
ser = DocTagsDocSerializer(params=dt_params, doc=doc)
page_items: dict[int, list[SerializationResult]] = {}
for item, _ in doc.iterate_items():
if not isinstance(item, DocItem):
continue
page_no = cast(DocItem, item).prov[0].page_no
if page_no in page_items:
page_items[page_no].append(ser.serialize(item=item))
else:
page_items[page_no] = [ser.serialize(item=item)]
sorted_items = [page_items[key] for key in sorted(page_items.keys())]
pages = [ser.serialize_page(parts=parts) for parts in sorted_items]
return ser.serialize_doc(pages=pages).text return doc.export_to_document_tokens(add_content=False)

734
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -46,8 +46,7 @@ packages = [{ include = "docling" }]
###################### ######################
python = "^3.9" python = "^3.9"
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-core = { git = "git@github.com:docling-project/docling-core.git", extras = ["chunking"], branch = "add-doctags-serializer"} docling-core = {extras = ["chunking"], version = "^2.23.1"}
#docling-core = { extras = ["chunking"], version = "^2.23.1" }
docling-ibm-models = "^3.4.0" docling-ibm-models = "^3.4.0"
docling-parse = "^4.0.0" docling-parse = "^4.0.0"
filetype = "^1.2.0" filetype = "^1.2.0"