mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
don't use raw doctags serializer
Signed-off-by: Yusik Kim <kmyusk@gmail.com>
This commit is contained in:
parent
3a09ca50bb
commit
b7fc13f3c4
@ -1,13 +1,8 @@
|
||||
from pathlib import Path
|
||||
from typing import cast
|
||||
|
||||
from docling_core.experimental.serializer.base import SerializationResult
|
||||
from docling_core.experimental.serializer.doctags import (
|
||||
DocTagsDocSerializer,
|
||||
DocTagsParams,
|
||||
)
|
||||
from docling_core.types.doc import DoclingDocument, Size
|
||||
from docling_core.types.doc.document import DocItem, DocTagsDocument
|
||||
from docling_core.types.doc.document import DocTagsDocument
|
||||
from docling_core.types.doc.tokens import DocumentToken
|
||||
from PIL import Image as PILImage
|
||||
|
||||
@ -27,18 +22,5 @@ def remove_doctags_content(doctags: str, images: list[PILImage.Image]) -> str:
|
||||
for idx, image in enumerate(images):
|
||||
size = Size(width=float(image.width), height=float(image.height))
|
||||
doc.add_page(page_no=idx + 1, size=size)
|
||||
dt_params = DocTagsParams(add_content=False)
|
||||
ser = DocTagsDocSerializer(params=dt_params, doc=doc)
|
||||
page_items: dict[int, list[SerializationResult]] = {}
|
||||
for item, _ in doc.iterate_items():
|
||||
if not isinstance(item, DocItem):
|
||||
continue
|
||||
page_no = cast(DocItem, item).prov[0].page_no
|
||||
if page_no in page_items:
|
||||
page_items[page_no].append(ser.serialize(item=item))
|
||||
else:
|
||||
page_items[page_no] = [ser.serialize(item=item)]
|
||||
sorted_items = [page_items[key] for key in sorted(page_items.keys())]
|
||||
pages = [ser.serialize_page(parts=parts) for parts in sorted_items]
|
||||
|
||||
return ser.serialize_doc(pages=pages).text
|
||||
return doc.export_to_document_tokens(add_content=False)
|
||||
|
734
poetry.lock
generated
734
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -46,8 +46,7 @@ packages = [{ include = "docling" }]
|
||||
######################
|
||||
python = "^3.9"
|
||||
pydantic = "^2.0.0"
|
||||
docling-core = { git = "git@github.com:docling-project/docling-core.git", extras = ["chunking"], branch = "add-doctags-serializer"}
|
||||
#docling-core = { extras = ["chunking"], version = "^2.23.1" }
|
||||
docling-core = {extras = ["chunking"], version = "^2.23.1"}
|
||||
docling-ibm-models = "^3.4.0"
|
||||
docling-parse = "^4.0.0"
|
||||
filetype = "^1.2.0"
|
||||
|
Loading…
Reference in New Issue
Block a user