mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
don't use raw doctags serializer
Signed-off-by: Yusik Kim <kmyusk@gmail.com>
This commit is contained in:
parent
3a09ca50bb
commit
b7fc13f3c4
@ -1,13 +1,8 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import cast
|
from typing import cast
|
||||||
|
|
||||||
from docling_core.experimental.serializer.base import SerializationResult
|
|
||||||
from docling_core.experimental.serializer.doctags import (
|
|
||||||
DocTagsDocSerializer,
|
|
||||||
DocTagsParams,
|
|
||||||
)
|
|
||||||
from docling_core.types.doc import DoclingDocument, Size
|
from docling_core.types.doc import DoclingDocument, Size
|
||||||
from docling_core.types.doc.document import DocItem, DocTagsDocument
|
from docling_core.types.doc.document import DocTagsDocument
|
||||||
from docling_core.types.doc.tokens import DocumentToken
|
from docling_core.types.doc.tokens import DocumentToken
|
||||||
from PIL import Image as PILImage
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
@ -27,18 +22,5 @@ def remove_doctags_content(doctags: str, images: list[PILImage.Image]) -> str:
|
|||||||
for idx, image in enumerate(images):
|
for idx, image in enumerate(images):
|
||||||
size = Size(width=float(image.width), height=float(image.height))
|
size = Size(width=float(image.width), height=float(image.height))
|
||||||
doc.add_page(page_no=idx + 1, size=size)
|
doc.add_page(page_no=idx + 1, size=size)
|
||||||
dt_params = DocTagsParams(add_content=False)
|
|
||||||
ser = DocTagsDocSerializer(params=dt_params, doc=doc)
|
|
||||||
page_items: dict[int, list[SerializationResult]] = {}
|
|
||||||
for item, _ in doc.iterate_items():
|
|
||||||
if not isinstance(item, DocItem):
|
|
||||||
continue
|
|
||||||
page_no = cast(DocItem, item).prov[0].page_no
|
|
||||||
if page_no in page_items:
|
|
||||||
page_items[page_no].append(ser.serialize(item=item))
|
|
||||||
else:
|
|
||||||
page_items[page_no] = [ser.serialize(item=item)]
|
|
||||||
sorted_items = [page_items[key] for key in sorted(page_items.keys())]
|
|
||||||
pages = [ser.serialize_page(parts=parts) for parts in sorted_items]
|
|
||||||
|
|
||||||
return ser.serialize_doc(pages=pages).text
|
return doc.export_to_document_tokens(add_content=False)
|
||||||
|
734
poetry.lock
generated
734
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -46,8 +46,7 @@ packages = [{ include = "docling" }]
|
|||||||
######################
|
######################
|
||||||
python = "^3.9"
|
python = "^3.9"
|
||||||
pydantic = "^2.0.0"
|
pydantic = "^2.0.0"
|
||||||
docling-core = { git = "git@github.com:docling-project/docling-core.git", extras = ["chunking"], branch = "add-doctags-serializer"}
|
docling-core = {extras = ["chunking"], version = "^2.23.1"}
|
||||||
#docling-core = { extras = ["chunking"], version = "^2.23.1" }
|
|
||||||
docling-ibm-models = "^3.4.0"
|
docling-ibm-models = "^3.4.0"
|
||||||
docling-parse = "^4.0.0"
|
docling-parse = "^4.0.0"
|
||||||
filetype = "^1.2.0"
|
filetype = "^1.2.0"
|
||||||
|
Loading…
Reference in New Issue
Block a user