mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
feat: add function to remove content from DocTags
Signed-off-by: Yusik Kim <kmyusk@gmail.com>
This commit is contained in:
parent
7df157204b
commit
b52b5672c9
17
docling/utils/doctags_utils.py
Normal file
17
docling/utils/doctags_utils.py
Normal file
@ -0,0 +1,17 @@
|
||||
from docling_core.experimental.serializer.doctags import (
|
||||
DocTagsDocSerializer,
|
||||
DocTagsParams,
|
||||
)
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
from docling_core.types.doc.document import DocTagsDocument
|
||||
from PIL import Image as PILImage
|
||||
|
||||
|
||||
def remove_doctags_content(doctags: str, image: PILImage.Image) -> str:
    """Re-serialize a DocTags string with ``add_content=False``.

    The DocTags markup and its page image are parsed into a temporary
    ``DoclingDocument``; every item of that document is then serialized
    again by a ``DocTagsDocSerializer`` configured with
    ``DocTagsParams(add_content=False)``, and the assembled text is returned.

    Args:
        doctags: DocTags markup to process.
        image: Image paired with ``doctags`` when building the intermediate
            ``DocTagsDocument``.

    Returns:
        The ``text`` attribute of the serializer's document-level result.
    """
    # Pair the single markup string with its single image.
    parsed = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])

    # Throwaway document; it only serves as the container the serializer reads.
    document = DoclingDocument(name="dummy")
    document.load_from_doctags(parsed)

    # NOTE(review): add_content=False presumably makes the serializer omit the
    # items' textual content while keeping the tags - confirm in docling-core.
    serializer = DocTagsDocSerializer(
        params=DocTagsParams(add_content=False),
        doc=document,
    )

    # Serialize item by item; the second element yielded by iterate_items()
    # is not needed here.
    serialized_items = []
    for node, _unused in document.iterate_items():
        serialized_items.append(serializer.serialize(item=node))

    assembled = serializer.serialize_doc(pages=serialized_items)
    return assembled.text
|
734
poetry.lock
generated
734
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -46,7 +46,8 @@ packages = [{ include = "docling" }]
|
||||
######################
|
||||
python = "^3.9"
|
||||
pydantic = "^2.0.0"
|
||||
docling-core = {extras = ["chunking"], version = "^2.23.1"}
|
||||
docling-core = { git = "git@github.com:docling-project/docling-core.git", extras = ["chunking"], branch = "add-doctags-serializer"}
|
||||
#docling-core = { extras = ["chunking"], version = "^2.23.1" }
|
||||
docling-ibm-models = "^3.4.0"
|
||||
docling-parse = "^4.0.0"
|
||||
filetype = "^1.2.0"
|
||||
|
Loading…
Reference in New Issue
Block a user